AI应用部署与运维
📖 概述
AI应用部署与运维是确保AI系统稳定运行和持续优化的关键环节。本文档将深入探讨现代AI应用的部署策略、运维实践和监控体系,帮助开发者构建可靠的AI生产环境。
🎯 学习目标
- 掌握AI应用部署策略
- 理解运维最佳实践
- 构建监控和告警体系
🚀 部署策略
部署架构选择
单体应用部署
# Dockerfile for the single-service (monolithic) AI application.
# Multi-stage build: the "builder" stage compiles the app, the "runtime" stage ships it.
FROM node:18-alpine AS builder
WORKDIR /app
# Copy only the manifests first so the dependency layer stays cached until they change.
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build
# Remove devDependencies so only runtime packages are copied into the final image.
RUN npm prune --omit=dev

FROM node:18-alpine AS runtime
WORKDIR /app
COPY --from=builder /app/dist ./dist
COPY --from=builder /app/node_modules ./node_modules
COPY package*.json ./
# Run as the unprivileged "node" user shipped with the official image instead of root.
USER node
EXPOSE 3000
CMD ["npm", "start"]
微服务部署
# docker-compose.yml
# Microservice stack: web frontend, API backend, GPU model service, PostgreSQL and Redis.
# NOTE(review): user/pass below are sample credentials; replace them before any real deployment.
version: '3.8'
services:
  ai-frontend:
    build: ./frontend
    ports:
      - "3000:3000"
    environment:
      - REACT_APP_API_URL=http://ai-backend:8000
    depends_on:
      - ai-backend

  ai-backend:
    build: ./backend
    ports:
      - "8000:8000"
    environment:
      - DATABASE_URL=postgresql://user:pass@db:5432/ai_app
      - REDIS_URL=redis://redis:6379
    depends_on:
      - db
      - redis

  ai-model-service:
    build: ./ai-services
    ports:
      - "5000:5000"
    environment:
      - MODEL_PATH=/models
      - GPU_ENABLED=true
    volumes:
      # Host directory with model files, mounted at the path MODEL_PATH points to.
      - ./models:/models
    deploy:
      resources:
        reservations:
          # Reserve one NVIDIA GPU for this container.
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  db:
    image: postgres:15
    environment:
      - POSTGRES_DB=ai_app
      - POSTGRES_USER=user
      - POSTGRES_PASSWORD=pass
    volumes:
      - postgres_data:/var/lib/postgresql/data

  redis:
    image: redis:7-alpine
    volumes:
      - redis_data:/data

# Named volumes backing the database and cache data directories.
volumes:
  postgres_data:
  redis_data:
云原生部署
Kubernetes部署
# k8s-deployment.yaml
# Deployment (3 replicas) for the frontend plus a LoadBalancer Service in front of it.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ai-frontend
spec:
  replicas: 3
  selector:
    matchLabels:
      app: ai-frontend
  template:
    metadata:
      labels:
        app: ai-frontend
    spec:
      containers:
        - name: ai-frontend
          image: ai-frontend:latest
          ports:
            - containerPort: 3000
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "200m"
          # Liveness: probes /health every 10s after a 30s initial delay.
          livenessProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          # Readiness: probes /ready every 5s after a 5s initial delay.
          readinessProbe:
            httpGet:
              path: /ready
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: ai-frontend-service
spec:
  # Selects the pods labelled by the Deployment above.
  selector:
    app: ai-frontend
  ports:
    - protocol: TCP
      port: 80
      targetPort: 3000
  type: LoadBalancer
Helm Chart部署
# values.yaml
# Helm values for the three workloads: frontend, backend and the GPU-backed AI model service.
frontend:
  replicaCount: 3
  image:
    repository: ai-frontend
    tag: latest
    pullPolicy: IfNotPresent
  resources:
    limits:
      cpu: 200m
      memory: 256Mi
    requests:
      cpu: 100m
      memory: 128Mi

backend:
  replicaCount: 2
  image:
    repository: ai-backend
    tag: latest
  resources:
    limits:
      cpu: 500m
      memory: 512Mi
    requests:
      cpu: 250m
      memory: 256Mi

aiService:
  replicaCount: 1
  image:
    repository: ai-model-service
    tag: latest
  gpu:
    enabled: true
    count: 1
  resources:
    # The GPU amount is the same in limits and requests.
    limits:
      cpu: 2000m
      memory: 8Gi
      nvidia.com/gpu: 1
    requests:
      cpu: 1000m
      memory: 4Gi
      nvidia.com/gpu: 1
🔧 环境配置管理
环境变量管理
// config/environment.ts

/** Values accepted for NODE_ENV. */
const NODE_ENVIRONMENTS = ['development', 'staging', 'production'] as const;

/** Typed application settings resolved from `process.env`. */
interface EnvironmentConfig {
  NODE_ENV: 'development' | 'staging' | 'production';
  PORT: number;
  DATABASE_URL: string;
  REDIS_URL: string;
  OPENAI_API_KEY: string;
  ANTHROPIC_API_KEY: string;
  JWT_SECRET: string;
  CORS_ORIGIN: string;
  LOG_LEVEL: string;
}

/** Narrows an arbitrary string to one of the supported NODE_ENV values. */
const isNodeEnvironment = (value: string): value is EnvironmentConfig['NODE_ENV'] =>
  (NODE_ENVIRONMENTS as readonly string[]).includes(value);

/**
 * Builds the application configuration from `process.env`.
 *
 * DATABASE_URL, REDIS_URL, OPENAI_API_KEY and JWT_SECRET are mandatory; every other
 * setting falls back to a default when unset or empty.
 *
 * @returns The resolved configuration.
 * @throws Error when a mandatory variable is missing or empty, when NODE_ENV is not one
 *   of the supported values, or when PORT is not an integer between 0 and 65535.
 */
const getEnvironmentConfig = (): EnvironmentConfig => {
  const requireEnv = (name: string): string => {
    const value = process.env[name];
    if (!value) {
      throw new Error(`Missing required environment variable: ${name}`);
    }
    return value;
  };

  // Mandatory variables are checked first, in this order, before any optional setting.
  const DATABASE_URL = requireEnv('DATABASE_URL');
  const REDIS_URL = requireEnv('REDIS_URL');
  const OPENAI_API_KEY = requireEnv('OPENAI_API_KEY');
  const JWT_SECRET = requireEnv('JWT_SECRET');

  // `||` (not `??`) is deliberate: an empty string is treated the same as "unset".
  const nodeEnv = process.env.NODE_ENV || 'development';
  if (!isNodeEnvironment(nodeEnv)) {
    throw new Error(
      `Invalid NODE_ENV "${nodeEnv}": expected one of ${NODE_ENVIRONMENTS.join(', ')}`
    );
  }

  const rawPort = process.env.PORT || '3000';
  const port = Number.parseInt(rawPort, 10);
  if (!Number.isInteger(port) || port < 0 || port > 65535) {
    throw new Error(`Invalid PORT "${rawPort}": expected an integer between 0 and 65535`);
  }

  return {
    NODE_ENV: nodeEnv,
    PORT: port,
    DATABASE_URL,
    REDIS_URL,
    OPENAI_API_KEY,
    ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY || '',
    JWT_SECRET,
    CORS_ORIGIN: process.env.CORS_ORIGIN || 'http://localhost:3000',
    LOG_LEVEL: process.env.LOG_LEVEL || 'info'
  };
};
// Resolved once when this module is first loaded; because getEnvironmentConfig throws on a
// missing required variable, importing this module fails in that case.
export const config = getEnvironmentConfig();
配置验证
// config/validation.ts
import Joi from 'joi';
/** Joi schema describing the environment variables the application reads. */
const envSchema = Joi.object({
  NODE_ENV: Joi.string()
    .valid('development', 'staging', 'production')
    .required(),
  PORT: Joi.number()
    .port()
    .default(3000),
  DATABASE_URL: Joi.string()
    .uri()
    .required(),
  REDIS_URL: Joi.string()
    .uri()
    .required(),
  OPENAI_API_KEY: Joi.string()
    .min(1)
    .required(),
  ANTHROPIC_API_KEY: Joi.string()
    .min(1)
    .optional(),
  JWT_SECRET: Joi.string()
    .min(32)
    .required(),
  CORS_ORIGIN: Joi.string()
    .uri()
    .default('http://localhost:3000'),
  LOG_LEVEL: Joi.string()
    .valid('error', 'warn', 'info', 'debug')
    .default('info')
});

/**
 * Validates `process.env` against `envSchema`.
 *
 * Variables not named in the schema are allowed on input and stripped from the result.
 *
 * @returns The validated value produced by Joi.
 * @throws Error with the Joi message when validation fails.
 */
export const validateEnvironment = () => {
  const validationOptions = { allowUnknown: true, stripUnknown: true };
  const outcome = envSchema.validate(process.env, validationOptions);

  if (outcome.error) {
    throw new Error(`Environment validation failed: ${outcome.error.message}`);
  }

  return outcome.value;
};
📊 监控和日志
应用监控
// monitoring/app-monitor.ts
import { register, collectDefaultMetrics, Counter, Histogram, Gauge } from 'prom-client';
/**
 * Prometheus instrumentation for the application: HTTP request counters and latencies,
 * an active-connection gauge, and counters and latencies for outbound AI API calls.
 */
class ApplicationMonitor {
  private readonly requestCounter: Counter;
  private readonly requestDuration: Histogram;
  private readonly activeConnections: Gauge;
  private readonly aiRequestCounter: Counter;
  private readonly aiResponseTime: Histogram;

  constructor() {
    // Register prom-client's default metrics before the custom ones.
    collectDefaultMetrics();

    // Custom metrics.
    const httpLabels = ['method', 'route'];
    const aiLabels = ['service', 'model'];

    this.requestCounter = new Counter({
      name: 'http_requests_total',
      help: 'Total number of HTTP requests',
      labelNames: [...httpLabels, 'status']
    });
    this.requestDuration = new Histogram({
      name: 'http_request_duration_seconds',
      help: 'HTTP request duration in seconds',
      labelNames: [...httpLabels]
    });
    this.activeConnections = new Gauge({
      name: 'active_connections',
      help: 'Number of active connections'
    });
    this.aiRequestCounter = new Counter({
      name: 'ai_requests_total',
      help: 'Total number of AI API requests',
      labelNames: [...aiLabels, 'status']
    });
    this.aiResponseTime = new Histogram({
      name: 'ai_response_time_seconds',
      help: 'AI API response time in seconds',
      labelNames: [...aiLabels]
    });
  }

  /** Counts one HTTP request and observes its duration. */
  recordRequest(method: string, route: string, status: number, duration: number) {
    const labels = { method, route };
    this.requestCounter.inc({ ...labels, status });
    this.requestDuration.observe(labels, duration);
  }

  /** Counts one AI API request and observes its response time. */
  recordAIRequest(service: string, model: string, status: string, duration: number) {
    const labels = { service, model };
    this.aiRequestCounter.inc({ ...labels, status });
    this.aiResponseTime.observe(labels, duration);
  }

  /** Sets the active-connection gauge to `count`. */
  setActiveConnections(count: number) {
    this.activeConnections.set(count);
  }

  /** Returns whatever the shared prom-client registry reports from `metrics()`. */
  getMetrics() {
    return register.metrics();
  }
}

export const appMonitor = new ApplicationMonitor();
日志管理
// logging/logger.ts
import winston from 'winston';
import { ElasticsearchTransport } from 'winston-elasticsearch';
/**
 * Thin wrapper around a winston logger.
 *
 * Always logs to the console; when NODE_ENV is "production" it additionally ships
 * entries of level "info" and above to Elasticsearch.
 */
class Logger {
  private readonly logger: winston.Logger;

  constructor() {
    const transports: winston.transport[] = [
      new winston.transports.Console({
        format: winston.format.combine(
          winston.format.timestamp(),
          winston.format.colorize(),
          // One line per entry; any extra metadata is appended as pretty-printed JSON.
          winston.format.printf(({ timestamp, level, message, ...meta }) => {
            return `${timestamp} [${level}]: ${message} ${
              Object.keys(meta).length ? JSON.stringify(meta, null, 2) : ''
            }`;
          })
        )
      })
    ];

    // Add the Elasticsearch transport in production only.
    if (process.env.NODE_ENV === 'production') {
      transports.push(
        new ElasticsearchTransport({
          level: 'info',
          // `index` is an option of the transport itself; `clientOpts` is handed to the
          // Elasticsearch client and only carries connection settings.
          index: 'ai-app-logs',
          clientOpts: {
            node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
          }
        })
      );
    }

    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      transports
    });
  }

  /** Logs `message` at "info" level with optional metadata. */
  info(message: string, meta?: unknown) {
    this.logger.info(message, meta);
  }

  /** Logs `message` at "warn" level with optional metadata. */
  warn(message: string, meta?: unknown) {
    this.logger.warn(message, meta);
  }

  /** Logs `message` at "error" level with optional metadata. */
  error(message: string, meta?: unknown) {
    this.logger.error(message, meta);
  }

  /** Logs `message` at "debug" level with optional metadata. */
  debug(message: string, meta?: unknown) {
    this.logger.debug(message, meta);
  }
}

export const logger = new Logger();
健康检查
// health/health-check.ts
import { HealthCheck, HealthCheckResult } from '@nestjs/terminus';
/** Shape of the report produced by `HealthChecker.runHealthChecks`. */
interface HealthReport {
  status: 'healthy' | 'unhealthy';
  /** Outcome per registered check name; `true` means the check passed. */
  checks: Record<string, boolean>;
  /** ISO-8601 time at which the report was built. */
  timestamp: string;
  /** Wall-clock time spent running the checks, formatted as e.g. "12ms". */
  duration: string;
}

/**
 * Registry of named health checks.
 *
 * Each check is an async function resolving to `true` (healthy) or `false`. Checks for
 * the database, Redis and the AI service are registered on construction.
 */
class HealthChecker {
  private readonly checks: Map<string, () => Promise<boolean>> = new Map();

  constructor() {
    this.registerDefaultChecks();
  }

  /** Registers the built-in checks; each logs the failure and reports `false` on error. */
  private registerDefaultChecks() {
    // Database connectivity check.
    this.registerCheck('database', async () => {
      try {
        // Run a simple query.
        await this.checkDatabaseConnection();
        return true;
      } catch (error) {
        logger.error('Database health check failed', { error });
        return false;
      }
    });

    // Redis connectivity check.
    this.registerCheck('redis', async () => {
      try {
        await this.checkRedisConnection();
        return true;
      } catch (error) {
        logger.error('Redis health check failed', { error });
        return false;
      }
    });

    // AI service check.
    this.registerCheck('ai-service', async () => {
      try {
        await this.checkAIService();
        return true;
      } catch (error) {
        logger.error('AI service health check failed', { error });
        return false;
      }
    });
  }

  /** Adds a check under `name`, replacing any check already registered with that name. */
  registerCheck(name: string, check: () => Promise<boolean>) {
    this.checks.set(name, check);
  }

  /**
   * Runs every registered check and summarises the outcome.
   *
   * Checks are started together so the total time is bounded by the slowest check.
   * A check that throws or rejects is logged and counted as failed. The overall status
   * is "healthy" only when every check passed.
   */
  async runHealthChecks(): Promise<HealthReport> {
    const startTime = Date.now();

    const outcomes = await Promise.all(
      Array.from(this.checks, async ([name, check]): Promise<[string, boolean]> => {
        try {
          return [name, await check()];
        } catch (error) {
          logger.error(`Health check failed for ${name}`, { error });
          return [name, false];
        }
      })
    );

    const duration = Date.now() - startTime;
    const isHealthy = outcomes.every(([, passed]) => passed);

    return {
      status: isHealthy ? 'healthy' : 'unhealthy',
      checks: Object.fromEntries(outcomes),
      timestamp: new Date().toISOString(),
      duration: `${duration}ms`
    };
  }

  private async checkDatabaseConnection() {
    // Placeholder: implement the database connectivity check here.
  }

  private async checkRedisConnection() {
    // Placeholder: implement the Redis connectivity check here.
  }

  private async checkAIService() {
    // Placeholder: implement the AI service check here.
  }
}

export const healthChecker = new HealthChecker();
🚨 告警和通知
告警规则
# prometheus/alert-rules.yml
# Alerting rules for the AI application: error rate, latency, service availability, memory.
groups:
  - name: ai-app-alerts
    rules:
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} errors per second"

      - alert: HighResponseTime
        # histogram_quantile needs the per-bucket rates aggregated by "le",
        # not the bare metric name.
        expr: histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }} seconds"

      - alert: AIServiceDown
        expr: up{job="ai-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "AI service is down"
          description: "AI service has been down for more than 1 minute"

      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value | humanizePercentage }}"
通知配置
# alertmanager/config.yml
# Routes every alert to the "team-ai" receiver, which notifies by e-mail and Slack.
# NOTE(review): the SMTP credentials and Slack webhook below are placeholders.
global:
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@yourcompany.com'
  smtp_auth_username: 'your-email@gmail.com'
  smtp_auth_password: 'your-app-password'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'team-ai'

receivers:
  - name: 'team-ai'
    email_configs:
      - to: 'ai-team@yourcompany.com'
        send_resolved: true
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#ai-alerts'
        send_resolved: true
        # Use Alertmanager's built-in templates: this file loads no custom template
        # files, so names other than the "slack.default.*" ones would be undefined.
        title: '{{ template "slack.default.title" . }}'
        text: '{{ template "slack.default.text" . }}'
🔄 CI/CD流水线
GitHub Actions配置
# .github/workflows/deploy.yml
# Runs tests, lint and build on pushes to main/develop and on pull requests to main,
# then deploys develop to staging and main to production.
name: Deploy AI Application

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18'
          cache: 'npm'
      - name: Install dependencies
        run: npm ci
      - name: Run tests
        run: npm test
      - name: Run linting
        run: npm run lint
      - name: Build application
        run: npm run build

  deploy-staging:
    needs: test
    runs-on: ubuntu-latest
    # Only the develop branch is deployed to staging.
    if: github.ref == 'refs/heads/develop'
    environment: staging
    steps:
      - uses: actions/checkout@v4
      - name: Deploy to staging
        run: |
          echo "Deploying to staging environment"
          # Commands that deploy to the staging environment go here.

  deploy-production:
    needs: test
    runs-on: ubuntu-latest
    # Only the main branch is deployed to production.
    if: github.ref == 'refs/heads/main'
    environment: production
    steps:
      - uses: actions/checkout@v4
      - name: Deploy to production
        run: |
          echo "Deploying to production environment"
          # Commands that deploy to the production environment go here.
自动化测试
// tests/integration/ai-service.test.ts
import request from 'supertest';
import { app } from '../../src/app';
import { setupTestDatabase, teardownTestDatabase } from '../utils/test-db';
// Integration tests for the text-generation and health endpoints, run against a test database.
describe('AI Service Integration Tests', () => {
  // Starts a POST to the text-generation endpoint with the given payload.
  const postGenerateText = (payload: { prompt: string; maxTokens: number }) =>
    request(app)
      .post('/api/ai/generate-text')
      .send(payload);

  beforeAll(async () => {
    await setupTestDatabase();
  });

  afterAll(async () => {
    await teardownTestDatabase();
  });

  describe('POST /api/ai/generate-text', () => {
    it('should generate text successfully', async () => {
      const payload = { prompt: 'Hello, how are you?', maxTokens: 100 };

      const { body } = await postGenerateText(payload).expect(200);

      expect(body).toHaveProperty('text');
      expect(body.text).toBeTruthy();
    });

    it('should handle invalid prompts', async () => {
      // An empty prompt is expected to be rejected with HTTP 400.
      const payload = { prompt: '', maxTokens: 100 };

      const { body } = await postGenerateText(payload).expect(400);

      expect(body).toHaveProperty('error');
    });
  });

  describe('GET /api/health', () => {
    it('should return healthy status', async () => {
      const { body } = await request(app)
        .get('/api/health')
        .expect(200);

      expect(body.status).toBe('healthy');
    });
  });
});
📈 性能监控
性能指标收集
// monitoring/performance-monitor.ts
/**
 * In-memory collector of numeric samples grouped by metric name, with simple summary
 * statistics (count, min, max, average, p95, p99) computed on demand.
 */
class PerformanceMonitor {
  private readonly metrics: Map<string, number[]> = new Map();

  /** Appends `value` to the samples recorded under `name`. */
  recordMetric(name: string, value: number) {
    const samples = this.metrics.get(name);
    if (samples) {
      samples.push(value);
    } else {
      this.metrics.set(name, [value]);
    }
  }

  /**
   * Summarises the samples recorded under `name`.
   *
   * @returns `null` when no sample has been recorded for `name`; otherwise the count,
   *   minimum, maximum, average and the p95/p99 values, where a percentile is the
   *   element at index `floor(count * fraction)` of the ascending-sorted samples.
   */
  getMetrics(name: string) {
    const values = this.metrics.get(name) || [];
    if (values.length === 0) return null;

    // Sort a copy: reading statistics must not reorder the recorded samples.
    const sorted = [...values].sort((a, b) => a - b);

    return {
      count: sorted.length,
      min: sorted[0],
      max: sorted[sorted.length - 1],
      avg: sorted.reduce((a, b) => a + b, 0) / sorted.length,
      p95: sorted[Math.floor(sorted.length * 0.95)],
      p99: sorted[Math.floor(sorted.length * 0.99)]
    };
  }

  /** Builds an object mapping every recorded metric name to its `getMetrics` summary. */
  generateReport() {
    const report: Record<string, any> = {};
    for (const name of this.metrics.keys()) {
      report[name] = this.getMetrics(name);
    }
    return report;
  }
}

export const performanceMonitor = new PerformanceMonitor();
资源使用监控
// monitoring/resource-monitor.ts
import os from 'os';
/** Reads host-level and process-level resource figures from Node's `os` and `process` APIs. */
class ResourceMonitor {
  /**
   * Snapshot of host metrics: CPU load averages and core count, memory totals with the
   * used share as a percentage, plus uptime, platform and architecture.
   */
  getSystemMetrics() {
    const total = os.totalmem();
    const free = os.freemem();
    const used = total - free;

    const cpu = {
      loadAverage: os.loadavg(),
      cores: os.cpus().length
    };
    const memory = {
      total,
      used,
      free,
      usage: (used / total) * 100
    };

    return {
      cpu,
      memory,
      uptime: os.uptime(),
      platform: os.platform(),
      arch: os.arch()
    };
  }

  /** Memory figures of the current process, as reported by `process.memoryUsage()`. */
  getProcessMetrics() {
    const { rss, heapTotal, heapUsed, external, arrayBuffers } = process.memoryUsage();
    return { rss, heapTotal, heapUsed, external, arrayBuffers };
  }
}

export const resourceMonitor = new ResourceMonitor();
🔒 安全配置
安全中间件
// middleware/security.ts
import helmet from 'helmet';
import rateLimit from 'express-rate-limit';
import cors from 'cors';
/** Origins allowed when CORS_ORIGIN is unset or contains no usable entry. */
const DEFAULT_CORS_ORIGINS = ['http://localhost:3000'];

/**
 * Parses a comma-separated origin list.
 *
 * Entries are trimmed and empty entries are dropped, so values such as
 * "https://a.example, https://b.example" or a trailing comma still match exactly.
 * Falls back to `DEFAULT_CORS_ORIGINS` when nothing usable remains.
 */
const parseCorsOrigins = (raw: string | undefined): string[] => {
  const origins = (raw ?? '')
    .split(',')
    .map((origin) => origin.trim())
    .filter((origin) => origin.length > 0);
  return origins.length > 0 ? origins : DEFAULT_CORS_ORIGINS;
};

/**
 * Security middleware stack, in order: HTTP security headers, CORS, a general per-IP
 * rate limit, and a stricter limit that applies only to AI endpoints.
 */
export const securityMiddleware = [
  // Security headers.
  helmet({
    contentSecurityPolicy: {
      directives: {
        defaultSrc: ["'self'"],
        styleSrc: ["'self'", "'unsafe-inline'"],
        scriptSrc: ["'self'"],
        imgSrc: ["'self'", "data:", "https:"],
        connectSrc: ["'self'", "https://api.openai.com", "https://api.anthropic.com"]
      }
    }
  }),

  // CORS configuration.
  cors({
    origin: parseCorsOrigins(process.env.CORS_ORIGIN),
    credentials: true,
    methods: ['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'],
    allowedHeaders: ['Content-Type', 'Authorization']
  }),

  // General rate limit.
  rateLimit({
    windowMs: 15 * 60 * 1000, // 15 minutes
    max: 100, // at most 100 requests per IP per 15-minute window
    message: 'Too many requests from this IP, please try again later.',
    standardHeaders: true,
    legacyHeaders: false
  }),

  // AI-API-specific rate limit.
  rateLimit({
    windowMs: 60 * 1000, // 1 minute
    max: 10, // at most 10 AI requests per IP per minute
    message: 'Too many AI requests, please try again later.',
    standardHeaders: true,
    legacyHeaders: false,
    // Requests whose path does not contain /api/ai/ are exempt from this limiter.
    skip: (req) => !req.path.includes('/api/ai/')
  })
];
环境隔离
// config/environment-isolation.ts
/**
 * Process-wide registry of per-environment configuration objects.
 *
 * Obtain the shared instance through `getInstance()`. `getIsolatedConfig` hands out a
 * deep copy, so callers can modify the result without affecting the stored configuration.
 */
class EnvironmentIsolation {
  private static instance: EnvironmentIsolation;
  // Configuration shapes are caller-defined, hence the loose value type.
  private readonly environments: Map<string, any> = new Map();

  /** Returns the shared instance, creating it on first use. */
  static getInstance(): EnvironmentIsolation {
    if (!EnvironmentIsolation.instance) {
      EnvironmentIsolation.instance = new EnvironmentIsolation();
    }
    return EnvironmentIsolation.instance;
  }

  /** Stores `config` under `name`, replacing any previous entry. */
  setEnvironment(name: string, config: any) {
    this.environments.set(name, config);
  }

  /** Returns the stored configuration object itself (not a copy), or `undefined`. */
  getEnvironment(name: string) {
    return this.environments.get(name);
  }

  /**
   * Checks that the configuration stored under `name` exists and defines the keys
   * DATABASE_URL, API_KEYS and SECURITY_CONFIG as own properties.
   */
  validateEnvironment(name: string): boolean {
    const config = this.environments.get(name);
    if (!config) return false;

    // Required configuration keys.
    const requiredKeys = ['DATABASE_URL', 'API_KEYS', 'SECURITY_CONFIG'];
    // Call through Object.prototype so configs without a prototype, or with their own
    // `hasOwnProperty` entry, are handled instead of throwing.
    return requiredKeys.every((key) => Object.prototype.hasOwnProperty.call(config, key));
  }

  /**
   * Returns a deep copy of the configuration stored under `environment`.
   *
   * The copy is made with a JSON round-trip, so only JSON-serialisable data survives.
   *
   * @throws Error when no configuration is stored under `environment`.
   */
  getIsolatedConfig(environment: string) {
    const baseConfig = this.environments.get(environment);
    if (!baseConfig) {
      throw new Error(`Environment ${environment} not found`);
    }

    // Return an isolated copy of the configuration.
    return JSON.parse(JSON.stringify(baseConfig));
  }
}

export const environmentIsolation = EnvironmentIsolation.getInstance();
📚 学习资源
部署和运维
- 《Docker实战》
- 《Kubernetes权威指南》
- 《DevOps实践指南》
监控和可观测性
- 《可观测性工程》
- 《SRE: Google运维解密》
- 《监控的艺术》
在线资源
- Kubernetes官方文档
- Prometheus监控指南
- ELK Stack最佳实践
🎯 下一步学习
完成本文档的学习后,建议继续学习:
- 高级部署策略
  - 蓝绿部署
  - 金丝雀发布
  - 滚动更新
- 云原生运维
  - 服务网格
  - 混沌工程
  - GitOps实践
- AI运维特定
  - 模型版本管理
  - A/B测试部署
  - 模型性能监控
让我们一起构建可靠的AI生产环境! 🚀