加载本地模型

这里加载的是 ONNX 格式的量化版本模型：onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX。

本地硬件：

内存 32G
硬盘 1T
CPU i7-10610U
显卡 Intel(R) UHD Graphics 128MB
操作系统 Win10

本文是用 CPU 跑的，速度非常慢；换用 GPU 可能会快一些。

WEB环境

vite7
vue3

使用库：@huggingface/transformers

先创建一个vite项目demo

bash

npm create vite@latest test-model-deepseek-r1-1.5b -- --template vue

安装依赖

使用vscode打开项目，打开终端

bash

npm i @huggingface/transformers

把下载的模型文件（配置文件和存放模型的 onnx 文件夹）拷贝到 public 文件夹下

具体文件目录为：public/models/onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX

配置vite.config.js

主要是增加跨源隔离（COOP/COEP）响应头，另外把 .onnx 文件声明为静态资源，并把 @huggingface/transformers 排除在依赖预构建之外

import { defineConfig } from 'vite'
import vue from '@vitejs/plugin-vue'


// Response headers that put the dev-server page into cross-origin isolation
// (COOP + COEP); multi-threaded WASM needs this.
const crossOriginIsolationHeaders = {
  'Cross-Origin-Opener-Policy': 'same-origin',
  'Cross-Origin-Embedder-Policy': 'require-corp',
};

// Vite configuration for running Transformers.js in the browser.
// https://vite.dev/config/
export default defineConfig({
  plugins: [vue()],

  // Treat .onnx files as static assets instead of JS modules.
  assetsInclude: ['**/*.onnx'],

  // Keep Transformers.js out of dependency pre-bundling so its WASM files are
  // not processed (and broken) by the optimizer.
  optimizeDeps: { exclude: ['@huggingface/transformers'] },

  server: { headers: crossOriginIsolationHeaders },
})

替换App.vue文件

vue

<template>
  <div class="container">
    <h1>DeepSeek-R1 本地推理 (优化版)</h1>

    <div class="status-bar">
      <span>状态: {{ statusText }}</span>
      <span v-if="tokensPerSecond > 0" class="speed-badge">
        速度: {{ tokensPerSecond.toFixed(1) }} tokens/s
      </span>
    </div>

    <div class="chat-container">
      <div class="messages" ref="messagesBox">
        <div v-for="(msg, idx) in messages" :key="idx" :class="['msg', msg.role]">
          <div class="msg-content">{{ msg.content }}</div>
        </div>
        <div v-if="isGenerating" class="msg assistant">
          <div class="msg-content typing">正在思考... <span class="cursor">|</span></div>
        </div>
      </div>

      <div class="input-area">
        <textarea v-model="inputText" placeholder="输入问题..." @keydown.enter.prevent="sendMessage"
          :disabled="isGenerating"></textarea>
        <button @click="sendMessage" :disabled="isGenerating || !isModelReady">
          {{ isGenerating ? '生成中...' : '发送' }}
        </button>
      </div>

      <div class="controls">
        <label>
          <input type="checkbox" v-model="useWebGPU" @change="resetModel" />
          启用 WebGPU (需显卡支持，速度更快)
        </label>
      </div>
    </div>
  </div>
</template>

<script setup>
import { ref, nextTick, onMounted } from 'vue';
import { pipeline, env, TextStreamer } from '@huggingface/transformers';

// --- Configuration ---
// Let Transformers.js resolve model ids against local files, with /models/ as
// the root path for those lookups.
env.allowLocalModels = true;
env.localModelPath = '/models/';

// --- State ---
// Chat history; entries are pushed as { role, content } objects.
const messages = ref([]);
// Content of the input box, pre-filled with a greeting.
const inputText = ref('你好');
// Text shown in the status bar.
const statusText = ref('点击发送加载模型');
// True while sendMessage is producing a reply.
const isGenerating = ref(false);
// Set to true once loadModel has finished successfully.
const isModelReady = ref(false);
const useWebGPU = ref(false); // Off by default; users with capable hardware can turn it on
// Throughput of the latest reply; the badge is only rendered when this is > 0.
const tokensPerSecond = ref(0);
// Template ref of the scrollable message list.
const messagesBox = ref(null);

// The loaded text-generation pipeline; stays null until loadModel succeeds.
let generator = null;

// --- Core logic: load the model ---

// The load currently in flight, as { device, promise }. Sharing it lets
// overlapping callers (mount hook, Enter key, backend toggle) reuse one load
// instead of initialising the model several times in parallel.
let pendingLoad = null;

/**
 * Load the text-generation pipeline for the backend selected by `useWebGPU`
 * and store it in `generator`.
 *
 * - Returns immediately when a pipeline is already loaded.
 * - Calls that overlap with a load for the same backend wait for that load.
 * - A load that finishes after a newer one was requested is discarded.
 * - When the WebGPU backend fails, the user is alerted and the load is retried
 *   on the WASM backend.
 *
 * Failures are reported through `statusText` and the console rather than by
 * rejecting the returned promise.
 *
 * @returns {Promise<void>}
 */
const loadModel = async () => {
  if (generator) return;

  const device = useWebGPU.value ? 'webgpu' : 'wasm';
  if (pendingLoad?.device === device) {
    await pendingLoad.promise;
    return;
  }

  const load = { device, promise: null };
  pendingLoad = load;

  load.promise = (async () => {
    try {
      statusText.value = '正在加载模型... (首次可能较慢)';

      const loaded = await pipeline('text-generation', 'onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX', {
        progress_callback: (p) => {
          // Ignore progress events of a load that has been superseded.
          if (pendingLoad === load && p.status === 'progress') {
            statusText.value = `加载模型: ${Math.round(p.progress)}%`;
          }
        },
        // Backend selection. The pipeline option is called `device`; an
        // `execution_provider` key is not among the pipeline options.
        device,
        // Precision: q4f16 for WebGPU (fast, low memory), q8 for WASM/CPU
        // (stable, widely compatible).
        dtype: device === 'webgpu' ? 'q4f16' : 'q8',
      });

      if (pendingLoad !== load) {
        // The backend was switched while this load was running: release the
        // stale pipeline instead of installing it.
        await loaded.dispose?.();
        return;
      }

      generator = loaded;
      isModelReady.value = true;
      statusText.value = device === 'webgpu' ? 'WebGPU 模式就绪' : 'CPU (WASM) 模式就绪';
    } catch (e) {
      console.error(e);
      if (pendingLoad !== load) return;

      // Clear the slot first so the WASM retry below starts a fresh load.
      pendingLoad = null;
      statusText.value = `加载失败: ${e.message}`;

      if (device === 'webgpu') {
        alert('WebGPU 加载失败，可能是浏览器不支持或显卡驱动问题。将切换回 CPU 模式。');
        useWebGPU.value = false;
        await loadModel();
      }
    } finally {
      if (pendingLoad === load) pendingLoad = null;
    }
  })();

  await load.promise;
};

// --- Core logic: send a message ---
/**
 * Send the content of the input box to the model and stream the reply into a
 * new assistant message.
 *
 * The model is loaded on demand when it is not available yet. The prompt is
 * passed in chat format so the pipeline applies the model's chat template, and
 * a `TextStreamer` delivers the reply incrementally (the pipeline call itself
 * resolves only once generation has finished). Errors are written into the
 * assistant message instead of being thrown.
 *
 * @returns {Promise<void>}
 */
const sendMessage = async () => {
  if (!inputText.value.trim() || isGenerating.value) return;

  // 1. Make sure a model is available.
  if (!generator) await loadModel();
  // Re-check after the await: loading may have failed, or another call may
  // have started generating (or consumed the input) in the meantime.
  if (!generator || isGenerating.value) return;

  const userText = inputText.value;
  if (!userText.trim()) return;

  // Keep using this pipeline even if `generator` is reassigned during the run.
  const activeGenerator = generator;

  messages.value.push({ role: 'user', content: userText });
  inputText.value = '';
  isGenerating.value = true;

  // 2. Add an empty assistant message that the streamed text is appended to.
  messages.value.push({ role: 'assistant', content: '' });
  const reply = messages.value[messages.value.length - 1];

  // Scroll the message list to its end once Vue has rendered pending changes.
  const scrollToBottom = async () => {
    await nextTick();
    if (messagesBox.value) {
      messagesBox.value.scrollTop = messagesBox.value.scrollHeight;
    }
  };

  // 3. Throughput statistics.
  const startTime = performance.now();
  let tokenCount = 0;

  try {
    const streamer = new TextStreamer(activeGenerator.tokenizer, {
      skip_prompt: true,
      skip_special_tokens: true,
      // Invoked as tokens are produced; used for the tokens/s figure.
      token_callback_function: () => {
        tokenCount += 1;
      },
      // Invoked with decoded text chunks as they become available.
      callback_function: (text) => {
        if (!text) return;
        reply.content += text;
        // Fire-and-forget: scrolling must not hold up generation.
        void scrollToBottom();
      },
    });

    const output = await activeGenerator([{ role: 'user', content: userText }], {
      max_new_tokens: 128, // keep replies reasonably short
      temperature: 0.7,
      top_p: 0.9,
      do_sample: true,
      streamer,
    });

    // Fallback: if nothing was streamed, take the reply from the final result.
    // For chat input `generated_text` is the message list ending with the reply.
    if (!reply.content) {
      const generated = output?.[0]?.generated_text;
      reply.content = Array.isArray(generated)
        ? (generated.at(-1)?.content ?? '')
        : (generated ?? '');
    }

    // 4. Compute the generation speed.
    const seconds = (performance.now() - startTime) / 1000;
    tokensPerSecond.value = seconds > 0 ? tokenCount / seconds : 0;
  } catch (e) {
    console.error(e);
    reply.content = `生成出错: ${e.message}`;
  } finally {
    isGenerating.value = false;
  }
};

/**
 * Drop the current pipeline and load a new one for the backend that is now
 * selected by `useWebGPU` (bound to the checkbox's change event).
 *
 * The previous pipeline is disposed so its resources are released rather than
 * piling up with every toggle.
 *
 * @returns {Promise<void>}
 */
const resetModel = async () => {
  const previous = generator;
  generator = null;
  isModelReady.value = false;
  statusText.value = '模型已重置，请点击发送重新加载';

  // Not disposed while a reply is being generated: that run still uses the
  // old pipeline, so it is only dereferenced in that case.
  if (previous && !isGenerating.value) {
    try {
      await previous.dispose?.();
    } catch (e) {
      console.error(e);
    }
  }

  await loadModel();
};

// Begin loading the model as soon as the component has been mounted; the
// load reports its own progress and errors, so the promise is not awaited.
onMounted(() => void loadModel());


</script>

<style scoped>
.container {
  max-width: 800px;
  margin: 0 auto;
  padding: 20px;
  font-family: 'Segoe UI', sans-serif;
}

.status-bar {
  display: flex;
  justify-content: space-between;
  padding: 10px;
  background: #f0f2f5;
  border-radius: 8px;
  margin-bottom: 15px;
  font-size: 14px;
}

.speed-badge {
  background: #28a745;
  color: white;
  padding: 2px 8px;
  border-radius: 12px;
  font-size: 12px;
}

.chat-container {
  border: 1px solid #ddd;
  border-radius: 12px;
  overflow: hidden;
}

.messages {
  height: 400px;
  overflow-y: auto;
  padding: 15px;
  background: #fff;
}

.msg {
  margin-bottom: 12px;
  display: flex;
}

.msg.user {
  justify-content: flex-end;
}

.msg-content {
  padding: 10px 15px;
  border-radius: 18px;
  max-width: 70%;
  white-space: pre-wrap;
  line-height: 1.5;
}

.msg.user .msg-content {
  background: #007bff;
  color: white;
  border-bottom-right-radius: 4px;
}

.msg.assistant .msg-content {
  background: #f1f3f5;
  color: black;
  border-bottom-left-radius: 4px;
}

.typing .cursor {
  animation: blink 1s infinite;
}

@keyframes blink {

  0%,
  50% {
    opacity: 1;
  }

  51%,
  100% {
    opacity: 0;
  }
}

.input-area {
  display: flex;
  padding: 15px;
  border-top: 1px solid #eee;
  background: #fafafa;
}

textarea {
  flex: 1;
  resize: none;
  border: 1px solid #ccc;
  border-radius: 8px;
  padding: 10px;
  margin-right: 10px;
  height: 40px;
  font-size: 16px;
}

button {
  background: #007bff;
  color: white;
  border: none;
  padding: 0 20px;
  border-radius: 8px;
  cursor: pointer;
  font-weight: bold;
}

button:disabled {
  background: #ccc;
  cursor: not-allowed;
}

.controls {
  padding: 10px 15px;
  font-size: 13px;
  color: #666;
  border-top: 1px solid #eee;
}
</style>

运行项目后，加载大模型需要1到3分钟。

发送消息，2到3分钟回答。

代码说明

App.vue文件中，关键代码

env

javascript

import { pipeline, env } from '@huggingface/transformers';

// --- 配置 ---
env.allowLocalModels = true;
env.localModelPath = '/models/';

这段配置允许从本地加载模型，并把本地模型的根目录设置为 /models/。

env的配置可以在官方文档找到。

pipeline

javascript

// Build the pipeline options according to the WebGPU switch
    const config = {
      // Report download progress in the status bar.
      progress_callback: (p) => {
        if (p.status === 'progress') {
          statusText.value = `加载模型: ${Math.round(p.progress)}%`;
        }
      },
      // [Key optimisation 1] Backend selection. The option is named `device`
      // (see the option list below); an `execution_provider` key is not one
      // of the pipeline options.
      device: useWebGPU.value ? 'webgpu' : 'wasm',

      // [Key optimisation 2] Model precision
      // WebGPU: q4f16 is recommended (fast, low memory usage)
      // WASM (CPU): q8 is recommended (stable, widely compatible)
      dtype: useWebGPU.value ? 'q4f16' : 'q8',
    };

    generator = await pipeline('text-generation', 'onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX', config);

pipeline 第一个参数是任务，有多种任务可选，官网文档有介绍任务类型

第二个参数是模型 ID，对应上面配置的本地加载目录 /models/ 下的子目录，比如 onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX。在这个目录里，配置文件直接放在目录下，模型文件放在其中的 onnx 文件夹里。

第三个参数是pipeline的选项，在官网找不到，代码中可以看到一些。

@param {import('./utils/hub.js').PretrainedModelOptions} [options] Optional parameters for the pipeline.
progress_callback = null,
config = null,
cache_dir = null,
local_files_only = false,
revision = 'main',
device = null,
dtype = null,
subfolder = 'onnx',
use_external_data_format = null,
model_file_name = null,
session_options = {},

在utils/hub.js中找到注释了

javascript

/**
 * @typedef {Object} PretrainedOptions Options for loading a pretrained model.
 * @property {import('./core.js').ProgressCallback} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates.
 * @property {import('../configs.js').PretrainedConfig} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
 * - The model is a model provided by the library (loaded with the *model id* string of a pretrained model).
 * - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory.
 * @property {string} [cache_dir=null] Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.
 * @property {boolean} [local_files_only=false] Whether or not to only look at local files (e.g., not try downloading the model).
 * @property {string} [revision='main'] The specific model version to use. It can be a branch name, a tag name, or a commit id,
 * since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git.
 * NOTE: This setting is ignored for local requests.
 */

/**
 * @typedef {Object} ModelSpecificPretrainedOptions Options for loading a pretrained model.
 * @property {string} [subfolder='onnx'] In case the relevant files are located inside a subfolder of the model repo on huggingface.co,
 * you can specify the folder name here.
 * @property {string} [model_file_name=null] If specified, load the model with this name (excluding the .onnx suffix). Currently only valid for encoder- or decoder-only models.
 * @property {import("./devices.js").DeviceType|Record<string, import("./devices.js").DeviceType>} [device=null] The device to run the model on. If not specified, the device will be chosen from the environment settings.
 * @property {import("./dtypes.js").DataType|Record<string, import("./dtypes.js").DataType>} [dtype=null] The data type to use for the model. If not specified, the data type will be chosen from the environment settings.
 * @property {ExternalData|Record<string, ExternalData>} [use_external_data_format=false] Whether to load the model using the external data format (used for models >= 2GB in size).
 * @property {import('onnxruntime-common').InferenceSession.SessionOptions} [session_options] (Optional) User-specified session options passed to the runtime. If not provided, suitable defaults will be chosen.
 */

/**
 * @typedef {PretrainedOptions & ModelSpecificPretrainedOptions} PretrainedModelOptions Options for loading a pretrained model.
 */

其中 dtype 表示加载模型时使用的数据类型（即精度/量化版本，比如 q8、q4f16），progress_callback 是加载过程中用来报告进度的回调。

加载本地模型 ​

WEB环境 ​

代码说明 ​

env ​

pipeline ​

Node环境 ​

加载本地模型

WEB环境

代码说明

env

pipeline

Node环境