Node.js AI应用开发基础
📖 概述
本文全面介绍如何使用Node.js开发AI应用,从环境搭建到实际项目实现。涵盖主流AI API集成、本地模型部署、数据处理、性能优化等核心技术。通过丰富的代码示例和实战项目,帮助开发者快速掌握Node.js AI应用开发的最佳实践。
🚀 Node.js AI开发环境搭建
基础环境配置
| 组件 | 版本要求 | 安装方式 | 用途说明 |
|---|---|---|---|
| Node.js | >=18.0.0 | 官网下载/nvm | JavaScript运行环境 |
| npm/yarn | 最新版本 | 随Node.js安装 | 包管理工具 |
| TypeScript | >=4.5.0 | npm install -g typescript | 类型安全开发 |
| Python | >=3.8.0 | 官网下载/pyenv | 机器学习库支持 |
核心依赖包
| 包名 | 版本 | 功能描述 | 使用场景 |
|---|---|---|---|
| @anthropic-ai/sdk | ^0.24.0 | Claude API客户端 | 对话和文本生成 |
| openai | ^4.0.0 | OpenAI API客户端 | GPT模型调用 |
| @google/generative-ai | ^0.15.0 | Gemini API客户端 | Google AI服务 |
| langchain | ^0.2.0 | AI应用开发框架 | 复杂AI工作流 |
| @tensorflow/tfjs-node | ^4.0.0 | TensorFlow.js | 本地模型推理 |
| sharp | ^0.33.0 | 图像处理 | 图像预处理 |
| pdf-parse | ^1.1.1 | PDF解析 | 文档处理 |
项目初始化
// package.json 配置示例
{
"name": "nodejs-ai-app",
"version": "1.0.0",
"description": "Node.js AI应用开发基础项目",
"main": "dist/index.js",
"scripts": {
"dev": "tsx watch src/index.ts",
"build": "tsc",
"start": "node dist/index.js",
"test": "jest",
"lint": "eslint src/**/*.ts",
"format": "prettier --write src/**/*.ts"
},
"dependencies": {
"@anthropic-ai/sdk": "^0.24.0",
"openai": "^4.0.0",
"@google/generative-ai": "^0.15.0",
"langchain": "^0.2.0",
"express": "^4.18.0",
"cors": "^2.8.5",
"helmet": "^7.0.0",
"dotenv": "^16.0.0",
"winston": "^3.8.0",
"joi": "^17.9.0",
"rate-limiter-flexible": "^3.0.0"
},
"devDependencies": {
"@types/node": "^20.0.0",
"@types/express": "^4.17.0",
"typescript": "^5.0.0",
"tsx": "^4.0.0",
"jest": "^29.0.0",
"@types/jest": "^29.0.0",
"eslint": "^8.0.0",
"prettier": "^3.0.0"
},
"engines": {
"node": ">=18.0.0"
}
}
🔧 AI API集成实战
统一AI客户端管理器
const fs = require('fs');
const path = require('path');
const { Anthropic } = require('@anthropic-ai/sdk');
const { OpenAI } = require('openai');
const { GoogleGenerativeAI } = require('@google/generative-ai');
const winston = require('winston');
// Application-wide winston logger: structured JSON (with timestamps and
// error stacks) goes to the log files, a simplified text format to the console.
const structuredLogFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.errors({ stack: true }),
  winston.format.json()
);
const logger = winston.createLogger({
  level: 'info',
  format: structuredLogFormat,
  transports: [
    new winston.transports.File({ filename: 'logs/error.log', level: 'error' }),
    new winston.transports.File({ filename: 'logs/combined.log' }),
    new winston.transports.Console({ format: winston.format.simple() })
  ]
});
/**
 * Unified client manager for multiple AI providers (OpenAI, Anthropic,
 * Google). Initializes only the providers whose API-key environment
 * variables are present, caches non-streaming text responses with a TTL,
 * tracks request statistics and offers a basic per-provider health check.
 */
class AIClientManager {
  /**
   * @param {object} [config] - Partial configuration merged over the defaults.
   * @param {string} [config.defaultProvider='openai'] - Provider used when none is given.
   * @param {number} [config.timeout=30000] - Per-request timeout in ms (passed to SDKs).
   * @param {number} [config.maxRetries=3] - SDK-level retry count.
   * @param {boolean} [config.enableCache=true] - Toggle response caching.
   * @param {number} [config.cacheExpiry=3600000] - Cache TTL in ms (1 hour).
   */
  constructor(config = {}) {
    this.config = {
      defaultProvider: 'openai',
      timeout: 30000,
      maxRetries: 3,
      retryDelay: 1000, // NOTE(review): not read anywhere in this class; kept for config compatibility
      enableCache: true,
      cacheExpiry: 3600000, // 1 hour
      ...config
    };
    this.clients = new Map(); // provider name -> SDK client instance
    this.cache = new Map();   // cache key -> { result, timestamp }
    this.requestStats = {
      total: 0,
      success: 0,
      errors: 0,
      cacheHits: 0
    };
    this.initializeClients();
    this.setupCacheCleanup();
  }

  /**
   * Creates one SDK client per provider whose API key is configured via
   * OPENAI_API_KEY / ANTHROPIC_API_KEY / GOOGLE_AI_API_KEY.
   */
  initializeClients() {
    console.log('🔧 初始化AI客户端...');
    try {
      // OpenAI client
      if (process.env.OPENAI_API_KEY) {
        this.clients.set('openai', new OpenAI({
          apiKey: process.env.OPENAI_API_KEY,
          timeout: this.config.timeout,
          maxRetries: this.config.maxRetries
        }));
        logger.info('OpenAI客户端初始化成功');
      }
      // Anthropic client
      if (process.env.ANTHROPIC_API_KEY) {
        this.clients.set('anthropic', new Anthropic({
          apiKey: process.env.ANTHROPIC_API_KEY,
          timeout: this.config.timeout,
          maxRetries: this.config.maxRetries
        }));
        logger.info('Anthropic客户端初始化成功');
      }
      // Google AI client — this SDK takes the key directly, not an options object.
      if (process.env.GOOGLE_AI_API_KEY) {
        this.clients.set('google', new GoogleGenerativeAI(
          process.env.GOOGLE_AI_API_KEY
        ));
        logger.info('Google AI客户端初始化成功');
      }
      console.log(`✅ 已初始化 ${this.clients.size} 个AI客户端`);
    } catch (error) {
      logger.error('AI客户端初始化失败:', error);
      throw error;
    }
  }

  /**
   * Generates text via the selected provider, with optional response caching.
   * @param {string} prompt - The user prompt.
   * @param {object} [options] - provider / model / maxTokens / temperature /
   *   useCache plus provider-specific extras (e.g. `stream` for OpenAI).
   * @returns {Promise<object>} provider result plus provider, model,
   *   responseTime and fromCache fields.
   * @throws {Error} when the provider is unknown, uninitialized, or the API call fails.
   */
  async generateText(prompt, options = {}) {
    const startTime = Date.now();
    this.requestStats.total++;
    try {
      const {
        provider = this.config.defaultProvider,
        model = 'gpt-3.5-turbo',
        maxTokens = 1000,
        temperature = 0.7,
        useCache = this.config.enableCache,
        ...otherOptions
      } = options;
      // Bug fix: streaming responses are one-shot async iterators — they can
      // be neither stored nor replayed, so caching is disabled for them.
      const cacheable = useCache && !otherOptions.stream;
      const cacheKey = this.generateCacheKey(prompt, options);
      if (cacheable && this.cache.has(cacheKey)) {
        const cached = this.cache.get(cacheKey);
        if (Date.now() - cached.timestamp < this.config.cacheExpiry) {
          this.requestStats.cacheHits++;
          logger.info(`缓存命中: ${cacheKey}`);
          return {
            ...cached.result,
            fromCache: true,
            responseTime: Date.now() - startTime
          };
        }
        this.cache.delete(cacheKey); // expired entry
      }
      console.log(`\n🤖 调用${provider}生成文本...`);
      console.log(`模型: ${model}`);
      console.log(`提示长度: ${prompt.length} 字符`);
      let result;
      switch (provider) {
        case 'openai':
          result = await this.callOpenAI(prompt, { model, maxTokens, temperature, ...otherOptions });
          break;
        case 'anthropic':
          result = await this.callAnthropic(prompt, { model, maxTokens, temperature, ...otherOptions });
          break;
        case 'google':
          result = await this.callGoogle(prompt, { model, maxTokens, temperature, ...otherOptions });
          break;
        default:
          throw new Error(`不支持的AI提供商: ${provider}`);
      }
      if (cacheable) {
        this.cache.set(cacheKey, {
          result,
          timestamp: Date.now()
        });
      }
      this.requestStats.success++;
      const responseTime = Date.now() - startTime;
      logger.info(`AI请求成功 - 提供商: ${provider}, 响应时间: ${responseTime}ms`);
      return {
        ...result,
        provider,
        model,
        responseTime,
        fromCache: false
      };
    } catch (error) {
      this.requestStats.errors++;
      logger.error('AI文本生成失败:', {
        error: error.message,
        provider: options.provider,
        promptLength: prompt.length
      });
      throw new Error(`AI文本生成失败: ${error.message}`);
    }
  }

  /**
   * Calls the OpenAI chat completions API.
   * @returns {Promise<object>} { text, usage, finishReason }, or { stream } when streaming.
   */
  async callOpenAI(prompt, options) {
    const client = this.clients.get('openai');
    if (!client) throw new Error('OpenAI客户端未初始化');
    const { model, maxTokens, temperature, stream = false } = options;
    const response = await client.chat.completions.create({
      model,
      messages: [{ role: 'user', content: prompt }],
      max_tokens: maxTokens,
      temperature,
      stream
    });
    if (stream) {
      // Caller is responsible for consuming the async iterator.
      return { stream: response };
    }
    return {
      text: response.choices[0].message.content,
      usage: response.usage,
      finishReason: response.choices[0].finish_reason
    };
  }

  /**
   * Calls the Anthropic messages API.
   * @returns {Promise<object>} { text, usage, stopReason }
   */
  async callAnthropic(prompt, options) {
    const client = this.clients.get('anthropic');
    if (!client) throw new Error('Anthropic客户端未初始化');
    const { model = 'claude-3-sonnet-20240229', maxTokens, temperature } = options;
    const response = await client.messages.create({
      model,
      max_tokens: maxTokens,
      temperature,
      messages: [{ role: 'user', content: prompt }]
    });
    return {
      text: response.content[0].text,
      usage: response.usage,
      stopReason: response.stop_reason
    };
  }

  /**
   * Calls the Google Generative AI (Gemini) API.
   * @returns {Promise<object>} { text, usage, finishReason }
   */
  async callGoogle(prompt, options) {
    const client = this.clients.get('google');
    if (!client) throw new Error('Google AI客户端未初始化');
    const { model = 'gemini-pro', temperature } = options;
    const genModel = client.getGenerativeModel({ model });
    const result = await genModel.generateContent({
      contents: [{ parts: [{ text: prompt }] }],
      generationConfig: {
        temperature,
        maxOutputTokens: options.maxTokens
      }
    });
    const response = await result.response;
    return {
      text: response.text(),
      usage: response.usageMetadata,
      finishReason: response.candidates[0].finishReason
    };
  }

  /**
   * Generates one or more images (currently OpenAI DALL·E only).
   * @returns {Promise<object>} { images: [{ url, revisedPrompt }], provider, model, responseTime }
   */
  async generateImage(prompt, options = {}) {
    const startTime = Date.now();
    try {
      const {
        provider = 'openai',
        model = 'dall-e-3',
        size = '1024x1024',
        quality = 'standard',
        style = 'vivid',
        n = 1
      } = options;
      console.log(`\n🎨 调用${provider}生成图像...`);
      console.log(`模型: ${model}, 尺寸: ${size}`);
      let result;
      switch (provider) {
        case 'openai': { // braces scope the lexical declarations to this case
          const client = this.clients.get('openai');
          if (!client) throw new Error('OpenAI客户端未初始化');
          const response = await client.images.generate({
            model,
            prompt,
            n,
            size,
            quality,
            style
          });
          result = {
            images: response.data.map(img => ({
              url: img.url,
              revisedPrompt: img.revised_prompt
            })),
            usage: response.usage // NOTE(review): images API responses may not carry usage — verify against SDK
          };
          break;
        }
        default:
          throw new Error(`不支持的图像生成提供商: ${provider}`);
      }
      const responseTime = Date.now() - startTime;
      logger.info(`图像生成成功 - 提供商: ${provider}, 响应时间: ${responseTime}ms`);
      return {
        ...result,
        provider,
        model,
        responseTime
      };
    } catch (error) {
      logger.error('图像生成失败:', error);
      throw new Error(`图像生成失败: ${error.message}`);
    }
  }

  /**
   * Analyzes an image with a vision-capable chat model (OpenAI only).
   * @param {string} imageUrl - Publicly reachable image URL (or data URL).
   * @param {string} prompt - Question/instruction about the image.
   * @returns {Promise<object>} { analysis, usage, provider, model, responseTime }
   */
  async analyzeImage(imageUrl, prompt, options = {}) {
    const startTime = Date.now();
    try {
      const {
        provider = 'openai',
        model = 'gpt-4-vision-preview',
        maxTokens = 1000
      } = options;
      console.log(`\n👁️ 调用${provider}分析图像...`);
      let result;
      switch (provider) {
        case 'openai': {
          const client = this.clients.get('openai');
          if (!client) throw new Error('OpenAI客户端未初始化');
          const response = await client.chat.completions.create({
            model,
            messages: [
              {
                role: 'user',
                content: [
                  { type: 'text', text: prompt },
                  { type: 'image_url', image_url: { url: imageUrl } }
                ]
              }
            ],
            max_tokens: maxTokens
          });
          result = {
            analysis: response.choices[0].message.content,
            usage: response.usage
          };
          break;
        }
        default:
          throw new Error(`不支持的图像分析提供商: ${provider}`);
      }
      const responseTime = Date.now() - startTime;
      logger.info(`图像分析成功 - 响应时间: ${responseTime}ms`);
      return {
        ...result,
        provider,
        model,
        responseTime
      };
    } catch (error) {
      logger.error('图像分析失败:', error);
      throw new Error(`图像分析失败: ${error.message}`);
    }
  }

  /**
   * Builds a deterministic cache key from the prompt and the options that
   * influence the output.
   * Bug fix: the previous implementation truncated the prompt to its first
   * 100 characters, so different prompts sharing a prefix collided and could
   * be served each other's cached responses. The full prompt is now hashed.
   */
  generateCacheKey(prompt, options) {
    const crypto = require('crypto');
    const keyData = {
      prompt,
      provider: options.provider,
      model: options.model,
      temperature: options.temperature,
      maxTokens: options.maxTokens
    };
    return crypto.createHash('sha256').update(JSON.stringify(keyData)).digest('base64');
  }

  /**
   * Starts an hourly sweep that evicts expired cache entries. The timer is
   * unref'd so it does not keep the Node.js process alive on its own.
   */
  setupCacheCleanup() {
    const timer = setInterval(() => {
      const now = Date.now();
      let cleanedCount = 0;
      for (const [key, value] of this.cache.entries()) {
        if (now - value.timestamp > this.config.cacheExpiry) {
          this.cache.delete(key);
          cleanedCount++;
        }
      }
      if (cleanedCount > 0) {
        logger.info(`清理了 ${cleanedCount} 个过期缓存项`);
      }
    }, 3600000); // hourly
    if (typeof timer.unref === 'function') {
      timer.unref(); // bug fix: the interval previously kept the process alive forever
    }
  }

  /**
   * Returns a snapshot of request counters, cache size and available providers.
   */
  getStats() {
    return {
      ...this.requestStats,
      cacheSize: this.cache.size,
      availableProviders: Array.from(this.clients.keys()),
      cacheHitRate: this.requestStats.total > 0 ?
        (this.requestStats.cacheHits / this.requestStats.total * 100).toFixed(2) + '%' : '0%'
    };
  }

  /** Empties the response cache. */
  clearCache() {
    this.cache.clear();
    logger.info('缓存已清空');
  }

  /**
   * Probes every initialized provider with a tiny uncached request.
   * @returns {Promise<object>} map of provider -> { status, responseTime | error }
   */
  async healthCheck() {
    const results = {};
    for (const provider of this.clients.keys()) {
      try {
        const startTime = Date.now();
        // Minimal probe request; bypass the cache so it actually hits the API.
        await this.generateText('Hello', {
          provider,
          maxTokens: 10,
          useCache: false
        });
        results[provider] = {
          status: 'healthy',
          responseTime: Date.now() - startTime
        };
      } catch (error) {
        results[provider] = {
          status: 'unhealthy',
          error: error.message
        };
      }
    }
    return results;
  }

  /**
   * Writes current statistics (plus cache entry ages) to a timestamped file.
   * @param {string} [format='json'] - Only 'json' actually writes a file.
   * @returns {string} the generated file name.
   */
  exportStats(format = 'json') {
    const stats = {
      timestamp: new Date().toISOString(),
      ...this.getStats(),
      cacheEntries: Array.from(this.cache.entries()).map(([key, value]) => ({
        key,
        timestamp: new Date(value.timestamp).toISOString(),
        age: Date.now() - value.timestamp
      }))
    };
    const filename = `ai_client_stats_${Date.now()}.${format}`;
    if (format === 'json') {
      fs.writeFileSync(filename, JSON.stringify(stats, null, 2));
    }
    // Bug fix: the original logged the literal text "$(unknown)" here instead
    // of interpolating the generated file name.
    console.log(`📊 统计数据已导出到: ${filename}`);
    return filename;
  }
}
// 使用示例
// Shared manager instance used by every demo function in this section.
const aiManagerOptions = {
  defaultProvider: 'openai',
  enableCache: true,
  timeout: 30000
};
const aiManager = new AIClientManager(aiManagerOptions);
// 文本生成示例
/**
 * Demo: generate text with OpenAI, then (only if an Anthropic key is
 * configured) with Claude, and finally print aggregate client statistics.
 */
async function textGenerationDemo() {
  console.log('🚀 AI文本生成演示\n');
  // Shared printer so both replies are formatted identically.
  const printReply = (label, reply) => {
    console.log(`${label} 回复:`);
    console.log(reply.text);
    console.log(`响应时间: ${reply.responseTime}ms\n`);
  };
  try {
    const gptResult = await aiManager.generateText(
      '请用Node.js编写一个简单的HTTP服务器',
      { provider: 'openai', model: 'gpt-3.5-turbo', maxTokens: 500, temperature: 0.7 }
    );
    printReply('GPT-3.5', gptResult);
    if (process.env.ANTHROPIC_API_KEY) {
      const claudeResult = await aiManager.generateText(
        '解释一下什么是RESTful API',
        { provider: 'anthropic', model: 'claude-3-sonnet-20240229', maxTokens: 300, temperature: 0.5 }
      );
      printReply('Claude', claudeResult);
    }
    const { total, success, cacheHitRate, availableProviders } = aiManager.getStats();
    console.log('📊 AI客户端统计:');
    console.log(`总请求数: ${total}`);
    console.log(`成功请求: ${success}`);
    console.log(`缓存命中率: ${cacheHitRate}`);
    console.log(`可用提供商: ${availableProviders.join(', ')}`);
  } catch (error) {
    console.error('演示失败:', error.message);
  }
}
// 图像生成示例
/** Demo: generate one DALL·E 3 image and print its URL and revised prompt. */
async function imageGenerationDemo() {
  console.log('\n🎨 AI图像生成演示\n');
  const prompt = 'A beautiful sunset over a mountain landscape, digital art style';
  const requestOptions = {
    provider: 'openai',
    model: 'dall-e-3',
    size: '1024x1024',
    quality: 'standard'
  };
  try {
    const { images, responseTime } = await aiManager.generateImage(prompt, requestOptions);
    const [firstImage] = images;
    console.log('图像生成成功:');
    console.log(`图像URL: ${firstImage.url}`);
    console.log(`修订提示: ${firstImage.revisedPrompt}`);
    console.log(`响应时间: ${responseTime}ms`);
  } catch (error) {
    console.error('图像生成失败:', error.message);
  }
}
// 健康检查示例
/** Demo: probe every initialized provider and print each outcome. */
async function healthCheckDemo() {
  console.log('\n🏥 AI服务健康检查\n');
  const healthStatus = await aiManager.healthCheck();
  for (const [provider, status] of Object.entries(healthStatus)) {
    console.log(`${provider}: ${status.status}`);
    if (status.responseTime) {
      console.log(` 响应时间: ${status.responseTime}ms`);
    }
    if (status.error) {
      console.log(` 错误: ${status.error}`);
    }
  }
}
// 运行演示
// When executed directly (not required as a module), run every demo in
// sequence and finish by exporting the collected statistics.
if (require.main === module) {
  const runAllDemos = async () => {
    await textGenerationDemo();
    await imageGenerationDemo();
    await healthCheckDemo();
    aiManager.exportStats();
  };
  runAllDemos().catch(console.error);
}
module.exports = { AIClientManager };
🔄 流式响应处理
实时流式文本生成
const EventEmitter = require('events');
/**
 * EventEmitter wrapper around AIClientManager for streaming text generation.
 * Emits 'chunk' for every content delta, 'complete' once the stream ends and
 * 'error' on failure; optional onChunk/onComplete/onError callbacks mirror
 * the events.
 */
class StreamingAIClient extends EventEmitter {
  /** @param {object} aiManager - manager exposing generateText({ stream: true }). */
  constructor(aiManager) {
    super();
    this.aiManager = aiManager;
  }

  /**
   * Streams a text generation, echoing content deltas to stdout as they arrive.
   * @param {string} prompt
   * @param {object} [options] - generateText options plus onChunk/onComplete/onError callbacks.
   * @returns {Promise<object>} { fullText, chunks, finishReason, totalChunks }
   * @throws {Error} when the provider fails or does not support streaming.
   */
  async streamText(prompt, options = {}) {
    const { onChunk = null, onComplete = null, onError = null } = options;
    console.log(`\n🌊 开始流式文本生成...`);
    try {
      // Bug fix: explicitly disable caching — a consumed stream can never be
      // replayed, so a cached stream result would be unusable.
      const result = await this.aiManager.generateText(prompt, {
        ...options,
        stream: true,
        useCache: false
      });
      if (!result.stream) {
        // Only providers that return a `stream` handle support streaming.
        throw new Error(`当前提供商不支持流式响应`);
      }
      let fullText = '';
      let finishReason = null;
      const chunks = [];
      for await (const chunk of result.stream) {
        const content = chunk.choices[0]?.delta?.content || '';
        if (content) {
          fullText += content;
          chunks.push({
            content,
            timestamp: Date.now(),
            index: chunks.length
          });
          this.emit('chunk', {
            content,
            fullText,
            chunkIndex: chunks.length - 1
          });
          if (onChunk) {
            onChunk(content, fullText);
          }
          // Echo to the terminal in real time.
          process.stdout.write(content);
        }
        if (chunk.choices[0]?.finish_reason) {
          finishReason = chunk.choices[0].finish_reason;
        }
      }
      // Bug fix: completion used to be handled only when the final chunk
      // carried finish_reason; a stream ending without one returned undefined
      // and never emitted 'complete'. Completion now always runs after the loop.
      console.log('\n\n✅ 流式生成完成');
      const completeResult = {
        fullText,
        chunks,
        finishReason,
        totalChunks: chunks.length
      };
      this.emit('complete', completeResult);
      if (onComplete) {
        onComplete(completeResult);
      }
      return completeResult;
    } catch (error) {
      console.error('\n❌ 流式生成失败:', error.message);
      this.emit('error', error);
      if (onError) {
        onError(error);
      }
      throw error;
    }
  }

  /**
   * Streams a generation directly into a file on disk; the write stream is
   * destroyed on any failure so no half-open handle is leaked.
   */
  async streamToFile(prompt, filePath, options = {}) {
    const fs = require('fs');
    const writeStream = fs.createWriteStream(filePath);
    console.log(`\n📝 流式写入文件: ${filePath}`);
    try {
      await this.streamText(prompt, {
        ...options,
        onChunk: (content) => {
          writeStream.write(content);
        },
        onComplete: () => {
          writeStream.end();
          console.log(`✅ 文件写入完成: ${filePath}`);
        },
        onError: (error) => {
          writeStream.destroy();
          console.error(`❌ 文件写入失败: ${error.message}`);
        }
      });
    } catch (error) {
      writeStream.destroy();
      throw error;
    }
  }
}
// 使用示例
/** Demo: wire a StreamingAIClient to the shared manager and stream one answer. */
async function streamingDemo() {
  const streamClient = new StreamingAIClient(aiManager);
  // Per-chunk and completion reporting via events.
  streamClient
    .on('chunk', ({ chunkIndex, content }) => {
      console.log(`\n[Chunk ${chunkIndex}] 长度: ${content.length}`);
    })
    .on('complete', ({ fullText, totalChunks, finishReason }) => {
      console.log(`\n📊 生成统计:`);
      console.log(`总长度: ${fullText.length} 字符`);
      console.log(`总块数: ${totalChunks}`);
      console.log(`完成原因: ${finishReason}`);
    });
  try {
    await streamClient.streamText(
      '请详细解释Node.js的事件循环机制,包括各个阶段的作用',
      { provider: 'openai', model: 'gpt-3.5-turbo', maxTokens: 1000, temperature: 0.7 }
    );
  } catch (error) {
    console.error('流式演示失败:', error.message);
  }
}
📊 数据处理与预处理
文档处理工具
const fs = require('fs');
const path = require('path');
const pdfParse = require('pdf-parse');
const sharp = require('sharp');
/**
 * Normalized processor for text, PDF and image documents. Produces a common
 * result envelope per file, supports batched directory processing and can
 * build a simple overlapping-chunk search index from processed documents.
 */
class DocumentProcessor {
  constructor() {
    // Extension whitelist per category. Office formats are listed for
    // discovery purposes but have no handler yet, so processDocument
    // rejects them with an "unsupported format" error.
    this.supportedFormats = {
      text: ['.txt', '.md', '.json'],
      pdf: ['.pdf'],
      image: ['.jpg', '.jpeg', '.png', '.webp', '.gif'],
      office: ['.docx', '.xlsx', '.pptx']
    };
  }

  /**
   * Processes one file, dispatching on its extension. Never throws for
   * handler failures: those are reported via the returned `error` field.
   * @param {string} filePath
   * @param {object} [options] - forwarded to the format-specific handler.
   * @returns {Promise<object>} envelope with fileName/fileSize/content/error.
   */
  async processDocument(filePath, options = {}) {
    const ext = path.extname(filePath).toLowerCase();
    const stats = fs.statSync(filePath);
    console.log(`\n📄 处理文档: ${path.basename(filePath)}`);
    console.log(`文件大小: ${(stats.size / 1024).toFixed(2)} KB`);
    const result = {
      filePath,
      fileName: path.basename(filePath),
      fileSize: stats.size,
      extension: ext,
      processedAt: new Date().toISOString(),
      content: null,
      metadata: {},
      error: null
    };
    try {
      if (this.supportedFormats.text.includes(ext)) {
        result.content = await this.processTextFile(filePath, options);
      } else if (this.supportedFormats.pdf.includes(ext)) {
        result.content = await this.processPDFFile(filePath, options);
      } else if (this.supportedFormats.image.includes(ext)) {
        result.content = await this.processImageFile(filePath, options);
      } else {
        throw new Error(`不支持的文件格式: ${ext}`);
      }
      console.log(`✅ 文档处理完成`);
      console.log(`内容长度: ${result.content?.text?.length || 0} 字符`);
    } catch (error) {
      result.error = error.message;
      console.error(`❌ 文档处理失败: ${error.message}`);
    }
    return result;
  }

  /**
   * Reads a plain-text file and returns it with simple statistics.
   * @throws {Error} when the file exceeds maxSize bytes (default 10 MB).
   */
  async processTextFile(filePath, options = {}) {
    const { encoding = 'utf8', maxSize = 10 * 1024 * 1024 } = options;
    // Bug fix: check the size BEFORE reading so oversized files are rejected
    // without first being loaded into memory.
    const { size } = fs.statSync(filePath);
    if (size > maxSize) {
      throw new Error(`文件过大: ${size} > ${maxSize}`);
    }
    const content = fs.readFileSync(filePath, encoding);
    return {
      text: content,
      wordCount: content.split(/\s+/).length,
      lineCount: content.split('\n').length,
      encoding
    };
  }

  /**
   * Extracts text and metadata from a PDF via pdf-parse.
   * @throws {Error} when the page count exceeds maxPages (default 100).
   */
  async processPDFFile(filePath, options = {}) {
    const { maxPages = 100 } = options;
    const dataBuffer = fs.readFileSync(filePath);
    const pdfData = await pdfParse(dataBuffer);
    if (pdfData.numpages > maxPages) {
      throw new Error(`PDF页数过多: ${pdfData.numpages} > ${maxPages}`);
    }
    return {
      text: pdfData.text,
      pageCount: pdfData.numpages,
      wordCount: pdfData.text.split(/\s+/).length,
      metadata: pdfData.metadata,
      info: pdfData.info
    };
  }

  /**
   * Loads an image via sharp, optionally resizes/re-encodes it, and returns
   * it as a base64 data URL plus source metadata.
   */
  async processImageFile(filePath, options = {}) {
    const {
      resize = null,
      format = 'jpeg',
      quality = 80,
      extractText = false
    } = options;
    const image = sharp(filePath);
    const metadata = await image.metadata();
    let processedImage = image;
    // Optional downscale; never enlarges past the source dimensions.
    if (resize) {
      processedImage = processedImage.resize(resize.width, resize.height, {
        fit: 'inside',
        withoutEnlargement: true
      });
    }
    // Re-encode in the requested format/quality.
    if (format === 'jpeg') {
      processedImage = processedImage.jpeg({ quality });
    } else if (format === 'png') {
      processedImage = processedImage.png({ quality });
    }
    const buffer = await processedImage.toBuffer();
    const base64 = buffer.toString('base64');
    const result = {
      base64: `data:image/${format};base64,${base64}`,
      metadata: {
        width: metadata.width,
        height: metadata.height,
        format: metadata.format,
        size: metadata.size, // NOTE(review): sharp may leave this undefined for some inputs — verify
        density: metadata.density
      },
      processedSize: buffer.length
    };
    // OCR extraction would need an extra library (e.g. Tesseract.js);
    // kept as a placeholder so the result shape is stable.
    if (extractText) {
      result.extractedText = '';
    }
    return result;
  }

  /**
   * Processes every supported file under a directory in fixed-size parallel
   * batches.
   * @returns {Promise<object>} { totalFiles, successCount, errorCount, results, errors }
   */
  async batchProcess(directory, options = {}) {
    const {
      recursive = false,
      filePattern = null,
      maxFiles = 100,
      parallel = 3
    } = options;
    console.log(`\n📁 批量处理目录: ${directory}`);
    const files = this.findFiles(directory, { recursive, filePattern, maxFiles });
    console.log(`找到 ${files.length} 个文件`);
    const results = [];
    const errors = [];
    for (let i = 0; i < files.length; i += parallel) {
      const batch = files.slice(i, i + parallel);
      const batchResults = await Promise.all(
        batch.map((file) => this.processDocument(file, options))
      );
      // Bug fix: processDocument reports failures via result.error instead of
      // throwing, so failed documents previously counted as successes and the
      // errors list always stayed empty.
      for (const result of batchResults) {
        if (result.error) {
          errors.push({ file: result.filePath, error: result.error });
        } else {
          results.push(result);
        }
      }
      console.log(`已处理 ${Math.min(i + parallel, files.length)}/${files.length} 个文件`);
    }
    return {
      totalFiles: files.length,
      successCount: results.length,
      errorCount: errors.length,
      results,
      errors
    };
  }

  /**
   * Collects up to maxFiles supported files from a directory (optionally
   * recursive, optionally filtered by a filename pattern).
   * @returns {string[]} absolute/relative file paths in discovery order.
   */
  findFiles(directory, options = {}) {
    const { recursive = false, filePattern = null, maxFiles = 100 } = options;
    const files = [];
    const scanDirectory = (dir) => {
      if (files.length >= maxFiles) return;
      const items = fs.readdirSync(dir);
      for (const item of items) {
        if (files.length >= maxFiles) break;
        const fullPath = path.join(dir, item);
        const stat = fs.statSync(fullPath);
        if (stat.isDirectory() && recursive) {
          scanDirectory(fullPath);
        } else if (stat.isFile()) {
          const ext = path.extname(item).toLowerCase();
          const isSupported = Object.values(this.supportedFormats)
            .flat()
            .includes(ext);
          if (isSupported && (!filePattern || item.match(filePattern))) {
            files.push(fullPath);
          }
        }
      }
    };
    scanDirectory(directory);
    return files;
  }

  /**
   * Builds a flat search index: one entry per document plus one entry per
   * overlapping text chunk.
   * @param {object[]} documents - results from processDocument/batchProcess.
   * @returns {Promise<object>} { documents, chunks, metadata }
   */
  async createSearchIndex(documents, options = {}) {
    const {
      chunkSize = 1000,
      overlap = 200,
      minChunkSize = 100
    } = options;
    console.log(`\n🔍 创建搜索索引...`);
    const index = {
      documents: [],
      chunks: [],
      metadata: {
        createdAt: new Date().toISOString(),
        totalDocuments: documents.length,
        totalChunks: 0,
        options
      }
    };
    for (const doc of documents) {
      // Documents without extracted text (e.g. images) are skipped.
      if (!doc.content?.text) continue;
      const docIndex = {
        id: doc.fileName,
        filePath: doc.filePath,
        fileSize: doc.fileSize,
        processedAt: doc.processedAt,
        chunkCount: 0
      };
      const chunks = this.chunkText(doc.content.text, {
        chunkSize,
        overlap,
        minChunkSize
      });
      chunks.forEach((chunk, chunkIndex) => {
        index.chunks.push({
          id: `${doc.fileName}_${chunkIndex}`,
          documentId: doc.fileName,
          chunkIndex,
          text: chunk,
          length: chunk.length,
          wordCount: chunk.split(/\s+/).length
        });
      });
      docIndex.chunkCount = chunks.length;
      index.documents.push(docIndex);
    }
    index.metadata.totalChunks = index.chunks.length;
    console.log(`✅ 索引创建完成`);
    console.log(`文档数: ${index.documents.length}`);
    console.log(`块数: ${index.chunks.length}`);
    return index;
  }

  /**
   * Splits text into overlapping chunks, preferring sentence ('.') or
   * paragraph ('\n') boundaries.
   * Bug fix: the original rewound `start = end - overlap` unconditionally;
   * when a boundary landed within `overlap` characters of the chunk start,
   * `start` moved backwards (even negative) and the loop never terminated.
   * The rewind is now applied only when it still moves forward.
   */
  chunkText(text, options = {}) {
    const { chunkSize = 1000, overlap = 200, minChunkSize = 100 } = options;
    const chunks = [];
    let start = 0;
    while (start < text.length) {
      let end = start + chunkSize;
      // For non-final chunks, try to end on a sentence/paragraph boundary.
      if (end < text.length) {
        const sentenceEnd = text.lastIndexOf('.', end);
        const paragraphEnd = text.lastIndexOf('\n', end);
        const boundary = Math.max(sentenceEnd, paragraphEnd);
        if (boundary > start + minChunkSize) {
          end = boundary + 1;
        }
      }
      const chunk = text.slice(start, end).trim();
      if (chunk.length >= minChunkSize) {
        chunks.push(chunk);
      }
      // Rewind by `overlap` only when that still makes forward progress;
      // otherwise continue from `end` to guarantee termination.
      const next = end - overlap;
      start = next > start ? next : end;
    }
    return chunks;
  }

  /** Serializes an index to disk as pretty-printed JSON. */
  async saveIndex(index, filePath) {
    fs.writeFileSync(filePath, JSON.stringify(index, null, 2));
    console.log(`💾 索引已保存到: ${filePath}`);
  }

  /** Loads a previously saved index from disk. */
  async loadIndex(filePath) {
    const data = fs.readFileSync(filePath, 'utf8');
    const index = JSON.parse(data);
    console.log(`📖 索引已加载: ${index.metadata.totalDocuments} 文档, ${index.metadata.totalChunks} 块`);
    return index;
  }
}
// 使用示例
// Shared processor instance for the demo below.
const docProcessor = new DocumentProcessor();

/** Demo: single-file processing, batch processing, then index building. */
async function documentProcessingDemo() {
  console.log('📚 文档处理演示\n');
  try {
    // Single document.
    const single = await docProcessor.processDocument('./example.pdf', { maxPages: 50 });
    if (single.content) {
      console.log(`文档内容预览: ${single.content.text.substring(0, 200)}...`);
    }
    // Whole directory in parallel batches.
    const { successCount, errorCount, results } = await docProcessor.batchProcess('./documents', {
      recursive: true,
      maxFiles: 10,
      parallel: 2
    });
    console.log(`\n批量处理结果:`);
    console.log(`成功: ${successCount}`);
    console.log(`失败: ${errorCount}`);
    // Build and persist a search index when anything succeeded.
    if (results.length > 0) {
      const index = await docProcessor.createSearchIndex(results, {
        chunkSize: 800,
        overlap: 100
      });
      await docProcessor.saveIndex(index, './search_index.json');
    }
  } catch (error) {
    console.error('文档处理演示失败:', error.message);
  }
}
module.exports = { DocumentProcessor };
🎯 学习检验
理论理解检验
- 环境配置:能否正确配置Node.js AI开发环境?
- API集成:能否理解不同AI服务的API调用方式?
- 数据处理:能否实现文档的预处理和格式转换?
- 性能优化:能否理解缓存、并发等优化策略?
实践能力检验
- 客户端开发:能否开发统一的AI客户端管理器?
- 流式处理:能否实现实时的流式响应处理?
- 文档处理:能否构建完整的文档处理工具?
- 错误处理:能否实现健壮的错误处理和重试机制?
🚀 实践项目建议
基础实战项目
- AI聊天应用:构建支持多个AI模型的聊天应用
- 文档问答系统:基于文档内容的智能问答系统
- 代码生成工具:AI驱动的代码生成和优化工具
- 内容创作助手:多功能的AI内容创作工具
高级综合项目
- 企业AI平台:企业级的AI服务集成平台
- 多模态AI应用:支持文本、图像、语音的综合AI应用
- AI开发框架:可复用的Node.js AI开发框架
- 智能数据分析:AI驱动的数据分析和可视化平台
📚 延伸阅读
技术文档
- "Node.js AI Development Guide" - Node.js AI开发指南
- "OpenAI API Documentation" - OpenAI API官方文档
- "Building AI Applications with JavaScript" - JavaScript AI应用开发
- "LangChain.js Documentation" - LangChain JavaScript版本文档
开源项目
- LangChain.js - JavaScript AI应用开发框架
- Vercel AI SDK - AI应用开发工具包
- OpenAI Node.js Library - OpenAI官方Node.js库
- Hugging Face Transformers.js - 浏览器端AI模型库
💡 学习提示:Node.js AI应用开发需要掌握异步编程、API集成、数据处理等多项技能。建议从简单的API调用开始,逐步构建复杂的AI应用。重视错误处理和性能优化,关注AI服务的成本控制。实践中要注意API密钥的安全管理,合理使用缓存和批处理来提高效率。