跳到主要内容

微服务与云平台部署

概述

云原生部署是现代应用程序开发和部署的核心方法,它结合了微服务架构、容器化、服务网格和云平台的优势,提供了高度可扩展、弹性和可维护的解决方案。

微服务架构设计

架构原则

# microservices-principles.yml
architecture_principles:
single_responsibility:
description: "每个服务只负责一个业务功能"
benefits:
- "高内聚,低耦合"
- "独立开发和部署"
- "技术栈灵活性"

decentralized_governance:
description: "去中心化的治理模式"
practices:
- "数据库分离"
- "独立的CI/CD流水线"
- "团队自主决策"

failure_isolation:
description: "故障隔离和容错设计"
patterns:
- "断路器模式"
- "舱壁模式"
- "超时和重试"

evolutionary_design:
description: "演进式设计"
approaches:
- "API版本控制"
- "向后兼容"
- "渐进式迁移"

服务拆分策略

微服务通信模式

同步通信

// api-gateway.js
//
// Edge API gateway: applies security headers, CORS and rate limiting,
// then proxies /api/<service>s routes to the matching backend microservice.
const express = require('express');
// BUGFIX: http-proxy-middleware v1+ exports { createProxyMiddleware };
// the module itself has not been directly callable since v0.x.
const { createProxyMiddleware } = require('http-proxy-middleware');
const rateLimit = require('express-rate-limit');
const helmet = require('helmet');
const cors = require('cors');

const app = express();

// Security middleware
app.use(helmet());
app.use(cors());

// Rate limiting: 100 requests per IP per 15-minute window
const limiter = rateLimit({
  windowMs: 15 * 60 * 1000, // 15 minutes
  max: 100, // limit each IP to 100 requests per window
  message: 'Too many requests from this IP'
});
app.use(limiter);

// Service discovery table: proxy target per service, overridable via env vars.
const services = {
  user: {
    target: process.env.USER_SERVICE_URL || 'http://user-service:3001',
    changeOrigin: true,
    pathRewrite: { '^/api/users': '' }
  },
  order: {
    target: process.env.ORDER_SERVICE_URL || 'http://order-service:3002',
    changeOrigin: true,
    pathRewrite: { '^/api/orders': '' }
  },
  payment: {
    target: process.env.PAYMENT_SERVICE_URL || 'http://payment-service:3003',
    changeOrigin: true,
    pathRewrite: { '^/api/payments': '' }
  }
};

// Liveness probe endpoint
app.get('/health', (req, res) => {
  res.json({ status: 'healthy', timestamp: new Date().toISOString() });
});

// Mount one proxy per service at /api/<service>s (e.g. /api/users).
Object.keys(services).forEach(service => {
  const config = services[service];

  const proxyOptions = {
    ...config,
    // Per-request logging of method, URL and upstream target.
    onProxyReq: (proxyReq, req, res) => {
      console.log(`[${new Date().toISOString()}] ${req.method} ${req.url} -> ${config.target}`);
    },
    onError: (err, req, res) => {
      console.error(`Proxy error for ${service}:`, err.message);
      // BUGFIX: if the upstream died mid-response the headers are already
      // sent; calling res.status() then would crash the gateway.
      if (!res.headersSent) {
        res.status(503).json({ error: 'Service temporarily unavailable' });
      }
    }
  };

  app.use(`/api/${service}s`, createProxyMiddleware(proxyOptions));
});

// Last-resort error handler for anything the proxies did not catch.
app.use((err, req, res, next) => {
  console.error('Gateway error:', err);
  res.status(500).json({ error: 'Internal server error' });
});

const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
  console.log(`API Gateway running on port ${PORT}`);
});

异步通信

// event-bus.js
const amqp = require('amqplib');
const EventEmitter = require('events');

/**
 * Thin wrapper around an AMQP (RabbitMQ) connection providing
 * publish/subscribe with JSON payloads, retry with exponential backoff,
 * and a dead-letter fallback. It is also an EventEmitter: connection-level
 * 'error' and 'close' events are re-emitted on the bus itself.
 */
class EventBus extends EventEmitter {
  constructor() {
    super();
    this.connection = null;     // amqplib connection, set by connect()
    this.channel = null;        // amqplib channel, set by connect()
    this.exchanges = new Map(); // exchange name -> { type, options }
  }

  /**
   * Connect to RabbitMQ and open a channel.
   * @param {string} [url] - AMQP URL; defaults to RABBITMQ_URL or localhost.
   * @returns {Promise<EventBus>} this, for chaining.
   * @throws if the connection cannot be established.
   */
  async connect(url = process.env.RABBITMQ_URL || 'amqp://localhost') {
    try {
      this.connection = await amqp.connect(url);
      this.channel = await this.connection.createChannel();

      // Surface connection-level failures to listeners on this bus.
      this.connection.on('error', (err) => {
        console.error('RabbitMQ connection error:', err);
        this.emit('error', err);
      });

      this.connection.on('close', () => {
        console.log('RabbitMQ connection closed');
        this.emit('close');
      });

      console.log('Connected to RabbitMQ');
      return this;
    } catch (error) {
      console.error('Failed to connect to RabbitMQ:', error);
      throw error;
    }
  }

  /** Assert (create if missing) an exchange and remember its configuration. */
  async createExchange(name, type = 'topic', options = { durable: true }) {
    await this.channel.assertExchange(name, type, options);
    this.exchanges.set(name, { type, options });
    return this;
  }

  /**
   * Publish a JSON message, stamped with a timestamp and a unique messageId.
   * Messages are persistent by default.
   */
  async publish(exchange, routingKey, message, options = {}) {
    const messageBuffer = Buffer.from(JSON.stringify({
      ...message,
      timestamp: new Date().toISOString(),
      messageId: this.generateMessageId()
    }));

    const publishOptions = {
      persistent: true,
      contentType: 'application/json',
      ...options
    };

    return this.channel.publish(exchange, routingKey, messageBuffer, publishOptions);
  }

  /**
   * Bind a server-named queue to exchange/routingKey and consume it.
   * Failed messages are re-published with exponential backoff up to
   * options.maxRetries (default 3) times, then routed to the dead-letter
   * exchange. The original delivery is always acked in the end.
   */
  async subscribe(exchange, routingKey, handler, options = {}) {
    const queueOptions = {
      durable: true,
      exclusive: false,
      autoDelete: false,
      ...options.queue
    };

    const { queue } = await this.channel.assertQueue('', queueOptions);
    await this.channel.bindQueue(queue, exchange, routingKey);

    const consumeOptions = {
      noAck: false,
      ...options.consume
    };

    await this.channel.consume(queue, async (msg) => {
      if (msg) {
        try {
          const content = JSON.parse(msg.content.toString());
          await handler(content, msg);
          this.channel.ack(msg);
        } catch (error) {
          console.error('Message processing error:', error);

          // BUGFIX: headers is undefined when the publisher set none;
          // optional chaining keeps the retry counter from throwing.
          const retryCount = (msg.properties.headers?.['x-retry-count'] ?? 0) + 1;
          const maxRetries = options.maxRetries || 3;

          if (retryCount <= maxRetries) {
            // Exponential backoff: 2^retryCount seconds.
            setTimeout(() => {
              this.channel.publish(exchange, routingKey, msg.content, {
                ...msg.properties,
                headers: {
                  ...msg.properties.headers,
                  'x-retry-count': retryCount
                }
              });
            }, Math.pow(2, retryCount) * 1000);
          } else {
            // Give up: record on the dead-letter exchange. Never let this
            // throw, or the original message would stay unacked forever.
            try {
              await this.publishToDeadLetter(msg, error);
            } catch (dlqError) {
              console.error('Failed to publish to dead-letter exchange:', dlqError);
            }
          }

          // Ack the original either way; a retried copy (or the DLQ record)
          // carries the message forward.
          this.channel.ack(msg);
        }
      }
    }, consumeOptions);

    return queue;
  }

  /**
   * Record a permanently-failed message on the 'dead-letter' exchange.
   * BUGFIX: the payload may be the very thing that failed JSON.parse in
   * the consumer, so fall back to the raw string instead of parsing again
   * and throwing a second time.
   */
  async publishToDeadLetter(originalMsg, error) {
    const deadLetterExchange = 'dead-letter';
    await this.createExchange(deadLetterExchange);

    const rawContent = originalMsg.content.toString();
    let originalMessage;
    try {
      originalMessage = JSON.parse(rawContent);
    } catch {
      originalMessage = rawContent;
    }

    const deadLetterMessage = {
      originalMessage,
      error: error.message,
      timestamp: new Date().toISOString(),
      routingKey: originalMsg.fields.routingKey
    };

    await this.publish(deadLetterExchange, 'failed', deadLetterMessage);
  }

  /** Generate a reasonably-unique id: timestamp plus random base36 suffix. */
  generateMessageId() {
    // slice() replaces the deprecated substr().
    return `${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
  }

  /** Close the channel and connection, if open. */
  async close() {
    if (this.channel) {
      await this.channel.close();
    }
    if (this.connection) {
      await this.connection.close();
    }
  }
}

// Usage example: one shared EventBus instance for the services below
const eventBus = new EventBus();

// 订单服务发布事件
/**
 * Order service: persists orders (simulated) and announces each new order
 * on the 'orders' exchange as an 'order.created' event.
 */
class OrderService {
  constructor(eventBus) {
    this.eventBus = eventBus;
  }

  /**
   * Create an order and publish the corresponding domain event.
   * @param {object} orderData - userId, amount, items, ...
   * @returns {Promise<object>} the stored order.
   */
  async createOrder(orderData) {
    const order = await this.saveOrder(orderData);

    // Announce creation so downstream services (inventory, payment, ...)
    // can react asynchronously.
    const { id: orderId, userId, amount, items } = order;
    await this.eventBus.publish('orders', 'order.created', {
      orderId,
      userId,
      amount,
      items
    });

    return order;
  }

  /** Simulated persistence: timestamp id, 'pending' status, creation date. */
  async saveOrder(orderData) {
    const stored = {
      id: Date.now(),
      ...orderData,
      status: 'pending',
      createdAt: new Date()
    };
    return stored;
  }
}

// 库存服务订阅事件
/**
 * Inventory service: subscribes to order events, reserves stock for each
 * line item, and publishes an 'inventory.updated' event when done.
 */
class InventoryService {
  constructor(eventBus) {
    this.eventBus = eventBus;
    // BUGFIX: setupEventHandlers is async and a constructor cannot await it.
    // The original left the promise floating, so a failed subscription
    // (e.g. broker down) became an unhandled rejection. Attach a handler.
    this.setupEventHandlers().catch((err) => {
      console.error('Failed to set up inventory event handlers:', err);
    });
  }

  /** Subscribe to order.created with up to 3 retries per message. */
  async setupEventHandlers() {
    await this.eventBus.subscribe('orders', 'order.created',
      this.handleOrderCreated.bind(this),
      { maxRetries: 3 }
    );
  }

  /**
   * Handle an order.created event: reserve stock per item, then announce
   * the reservation on the 'inventory' exchange.
   * @param {{orderId: *, items: Array<{productId: *, quantity: number}>}} event
   */
  async handleOrderCreated(event) {
    console.log('Processing order created event:', event);

    // Reduce stock sequentially, preserving item order.
    for (const item of event.items) {
      await this.reduceInventory(item.productId, item.quantity);
    }

    await this.eventBus.publish('inventory', 'inventory.updated', {
      orderId: event.orderId,
      items: event.items,
      status: 'reserved'
    });
  }

  /** Simulated stock decrement. */
  async reduceInventory(productId, quantity) {
    console.log(`Reducing inventory for product ${productId} by ${quantity}`);
  }
}

module.exports = { EventBus, OrderService, InventoryService };

服务网格 (Service Mesh)

Istio 配置

基础配置

# istio-installation.yml
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
metadata:
name: control-plane
spec:
values:
global:
meshID: mesh1
multiCluster:
clusterName: cluster1
network: network1
pilot:
env:
EXTERNAL_ISTIOD: false
components:
pilot:
k8s:
resources:
requests:
cpu: 200m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
hpaSpec:
minReplicas: 2
maxReplicas: 5
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 80
ingressGateways:
- name: istio-ingressgateway
enabled: true
k8s:
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 2000m
memory: 1024Mi
hpaSpec:
minReplicas: 2
maxReplicas: 5
service:
type: LoadBalancer
ports:
- port: 15021
targetPort: 15021
name: status-port
- port: 80
targetPort: 8080
name: http2
- port: 443
targetPort: 8443
name: https
egressGateways:
- name: istio-egressgateway
enabled: true
k8s:
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 2000m
memory: 1024Mi

流量管理

# traffic-management.yml
---
apiVersion: networking.istio.io/v1beta1
kind: Gateway
metadata:
name: microservices-gateway
namespace: production
spec:
selector:
istio: ingressgateway
servers:
- port:
number: 80
name: http
protocol: HTTP
hosts:
- api.example.com
tls:
httpsRedirect: true
- port:
number: 443
name: https
protocol: HTTPS
tls:
mode: SIMPLE
credentialName: api-tls-secret
hosts:
- api.example.com

---
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: microservices-vs
namespace: production
spec:
hosts:
- api.example.com
gateways:
- microservices-gateway
http:
- match:
- uri:
prefix: /api/users
route:
- destination:
host: user-service
port:
number: 3001
weight: 90
- destination:
host: user-service-v2
port:
number: 3001
weight: 10
fault:
delay:
percentage:
value: 0.1
fixedDelay: 5s
retries:
attempts: 3
perTryTimeout: 2s
timeout: 10s
- match:
- uri:
prefix: /api/orders
route:
- destination:
host: order-service
port:
number: 3002
corsPolicy:
allowOrigins:
- exact: https://app.example.com
allowMethods:
- GET
- POST
- PUT
- DELETE
allowHeaders:
- authorization
- content-type
maxAge: 24h

---
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: user-service-dr
namespace: production
spec:
host: user-service
trafficPolicy:
connectionPool:
tcp:
maxConnections: 100
http:
http1MaxPendingRequests: 50
http2MaxRequests: 100
maxRequestsPerConnection: 10
maxRetries: 3
loadBalancer:
simple: LEAST_CONN
outlierDetection:
consecutiveGatewayErrors: 3
consecutive5xxErrors: 3
interval: 30s
baseEjectionTime: 30s
maxEjectionPercent: 50
subsets:
- name: v1
labels:
version: v1
- name: v2
labels:
version: v2
trafficPolicy:
connectionPool:
tcp:
maxConnections: 50

安全策略

# security-policies.yml
---
apiVersion: security.istio.io/v1beta1
kind: PeerAuthentication
metadata:
name: default
namespace: production
spec:
mtls:
mode: STRICT

---
apiVersion: security.istio.io/v1beta1
kind: AuthorizationPolicy
metadata:
name: user-service-authz
namespace: production
spec:
selector:
matchLabels:
app: user-service
rules:
- from:
- source:
principals: ["cluster.local/ns/production/sa/api-gateway"]
- to:
- operation:
methods: ["GET"]
paths: ["/health", "/metrics"]
- from:
- source:
requestPrincipals: ["*"]
to:
- operation:
methods: ["GET", "POST", "PUT", "DELETE"]
when:
- key: request.headers[authorization]
values: ["Bearer *"]

---
apiVersion: security.istio.io/v1beta1
kind: RequestAuthentication
metadata:
name: jwt-auth
namespace: production
spec:
selector:
matchLabels:
app: user-service
jwtRules:
- issuer: "https://auth.example.com"
jwksUri: "https://auth.example.com/.well-known/jwks.json"
audiences:
- "api.example.com"
forwardOriginalToken: true

云平台部署

AWS EKS 部署

Terraform 配置

# eks-cluster.tf
terraform {
required_version = ">= 1.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
kubernetes = {
source = "hashicorp/kubernetes"
version = "~> 2.20"
}
helm = {
source = "hashicorp/helm"
version = "~> 2.10"
}
}
}

provider "aws" {
region = var.aws_region
}

# VPC 配置
module "vpc" {
source = "terraform-aws-modules/vpc/aws"

name = "${var.cluster_name}-vpc"
cidr = var.vpc_cidr

azs = data.aws_availability_zones.available.names
private_subnets = var.private_subnets
public_subnets = var.public_subnets

enable_nat_gateway = true
single_nat_gateway = false
enable_dns_hostnames = true
enable_dns_support = true

public_subnet_tags = {
"kubernetes.io/role/elb" = "1"
}

private_subnet_tags = {
"kubernetes.io/role/internal-elb" = "1"
}

tags = {
"kubernetes.io/cluster/${var.cluster_name}" = "shared"
}
}

# EKS 集群
module "eks" {
source = "terraform-aws-modules/eks/aws"

cluster_name = var.cluster_name
cluster_version = var.kubernetes_version

vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnets
cluster_endpoint_public_access = true

# 集群加密
cluster_encryption_config = {
provider_key_arn = aws_kms_key.eks.arn
resources = ["secrets"]
}

# 集群日志
cluster_enabled_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]

# 节点组
eks_managed_node_groups = {
main = {
name = "main-node-group"

instance_types = ["t3.medium"]
capacity_type = "ON_DEMAND"

min_size = 2
max_size = 10
desired_size = 3

ami_type = "AL2_x86_64"

# 启动模板
launch_template_name = "main-node-group"
launch_template_version = "$Latest"

# 磁盘配置
block_device_mappings = {
xvda = {
device_name = "/dev/xvda"
ebs = {
volume_size = 50
volume_type = "gp3"
iops = 3000
throughput = 150
encrypted = true
kms_key_id = aws_kms_key.ebs.arn
delete_on_termination = true
}
}
}

# 网络配置
remote_access = {
ec2_ssh_key = var.ec2_ssh_key
source_security_group_ids = [aws_security_group.remote_access.id]
}

# 标签
tags = {
Environment = var.environment
NodeGroup = "main"
}
}

spot = {
name = "spot-node-group"

instance_types = ["t3.medium", "t3.large"]
capacity_type = "SPOT"

min_size = 0
max_size = 5
desired_size = 2

# Spot 实例配置
use_mixed_instances_policy = true
mixed_instances_policy = {
instances_distribution = {
on_demand_base_capacity = 0
on_demand_percentage_above_base_capacity = 0
spot_allocation_strategy = "capacity-optimized"
}

override = [
{
instance_type = "t3.medium"
weighted_capacity = "1"
},
{
instance_type = "t3.large"
weighted_capacity = "2"
}
]
}

# 污点配置
taints = {
spot = {
key = "spot"
value = "true"
effect = "NO_SCHEDULE"
}
}

tags = {
Environment = var.environment
NodeGroup = "spot"
}
}
}

# AWS Load Balancer Controller
enable_aws_load_balancer_controller = true

# EBS CSI Driver
enable_ebs_csi_driver = true

tags = {
Environment = var.environment
Terraform = "true"
}
}

# KMS 密钥
resource "aws_kms_key" "eks" {
description = "EKS Secret Encryption Key"
deletion_window_in_days = 7
enable_key_rotation = true

tags = {
Name = "${var.cluster_name}-eks-key"
}
}

resource "aws_kms_alias" "eks" {
name = "alias/${var.cluster_name}-eks"
target_key_id = aws_kms_key.eks.key_id
}

resource "aws_kms_key" "ebs" {
description = "EBS Encryption Key"
deletion_window_in_days = 7
enable_key_rotation = true

tags = {
Name = "${var.cluster_name}-ebs-key"
}
}

# 安全组
resource "aws_security_group" "remote_access" {
name_prefix = "${var.cluster_name}-remote-access"
vpc_id = module.vpc.vpc_id

ingress {
description = "SSH"
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = var.allowed_cidr_blocks
}

egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}

tags = {
Name = "${var.cluster_name}-remote-access"
}
}

# 数据源
data "aws_availability_zones" "available" {
filter {
name = "opt-in-status"
values = ["opt-in-not-required"]
}
}

data "aws_caller_identity" "current" {}

变量定义

# variables.tf
variable "aws_region" {
description = "AWS region"
type = string
default = "us-west-2"
}

variable "cluster_name" {
description = "Name of the EKS cluster"
type = string
default = "microservices-cluster"
}

variable "kubernetes_version" {
description = "Kubernetes version"
type = string
default = "1.27"
}

variable "environment" {
description = "Environment name"
type = string
default = "production"
}

variable "vpc_cidr" {
description = "CIDR block for VPC"
type = string
default = "10.0.0.0/16"
}

variable "private_subnets" {
description = "Private subnet CIDR blocks"
type = list(string)
default = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
}

variable "public_subnets" {
description = "Public subnet CIDR blocks"
type = list(string)
default = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
}

variable "ec2_ssh_key" {
description = "EC2 Key Pair name for SSH access"
type = string
default = ""
}

variable "allowed_cidr_blocks" {
description = "CIDR blocks allowed for remote access"
type = list(string)
default = ["10.0.0.0/8"]
}

Google Cloud GKE 部署

# gke-cluster.tf
terraform {
required_providers {
google = {
source = "hashicorp/google"
version = "~> 4.0"
}
}
}

provider "google" {
project = var.project_id
region = var.region
}

# VPC 网络
resource "google_compute_network" "vpc" {
name = "${var.cluster_name}-vpc"
auto_create_subnetworks = false
routing_mode = "GLOBAL"
}

# 子网
resource "google_compute_subnetwork" "subnet" {
name = "${var.cluster_name}-subnet"
ip_cidr_range = var.subnet_cidr
region = var.region
network = google_compute_network.vpc.id

secondary_ip_range {
range_name = "services-range"
ip_cidr_range = var.services_cidr
}

secondary_ip_range {
range_name = "pod-ranges"
ip_cidr_range = var.pod_cidr
}

private_ip_google_access = true
}

# GKE 集群
resource "google_container_cluster" "primary" {
name = var.cluster_name
location = var.region

# 移除默认节点池
remove_default_node_pool = true
initial_node_count = 1

# 网络配置
network = google_compute_network.vpc.name
subnetwork = google_compute_subnetwork.subnet.name

# IP 分配策略
ip_allocation_policy {
cluster_secondary_range_name = "pod-ranges"
services_secondary_range_name = "services-range"
}

# 网络策略
network_policy {
enabled = true
}

# 主版本配置
master_version = var.kubernetes_version

# 主节点授权网络
master_authorized_networks_config {
dynamic "cidr_blocks" {
for_each = var.authorized_networks
content {
cidr_block = cidr_blocks.value.cidr_block
display_name = cidr_blocks.value.display_name
}
}
}

# 私有集群配置
private_cluster_config {
enable_private_nodes = true
enable_private_endpoint = false
master_ipv4_cidr_block = var.master_cidr
}

# 工作负载身份
workload_identity_config {
workload_pool = "${var.project_id}.svc.id.goog"
}

# 集群功能
addons_config {
http_load_balancing {
disabled = false
}

horizontal_pod_autoscaling {
disabled = false
}

network_policy_config {
disabled = false
}

# NOTE(review): the istio_config GKE add-on is deprecated and removed from the
# google provider in v4+; install Istio / Anthos Service Mesh separately instead.
istio_config {
disabled = false
auth = "AUTH_MUTUAL_TLS"
}
}

# 维护策略
maintenance_policy {
recurring_window {
start_time = "2023-01-01T02:00:00Z"
end_time = "2023-01-01T06:00:00Z"
recurrence = "FREQ=WEEKLY;BYDAY=SA"
}
}

# 资源使用导出
resource_usage_export_config {
enable_network_egress_metering = true
enable_resource_consumption_metering = true

bigquery_destination {
dataset_id = google_bigquery_dataset.gke_usage.dataset_id
}
}
}

# 节点池
resource "google_container_node_pool" "primary_nodes" {
name = "${var.cluster_name}-node-pool"
location = var.region
cluster = google_container_cluster.primary.name
version = var.kubernetes_version

node_count = var.node_count

# 自动扩缩容
autoscaling {
min_node_count = var.min_node_count
max_node_count = var.max_node_count
}

# 节点配置
node_config {
preemptible = false
machine_type = var.machine_type

# Google服务账号
service_account = google_service_account.kubernetes.email
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only"
]

# 标签
labels = {
env = var.environment
}

# 污点
taint {
key = "instance-type"
value = "standard"
effect = "NO_SCHEDULE"
}

# 磁盘配置
disk_size_gb = var.disk_size_gb
disk_type = "pd-standard"

# 网络标签
tags = ["gke-node", "${var.cluster_name}-node"]

# 元数据
metadata = {
disable-legacy-endpoints = "true"
}
}

# 管理配置
management {
auto_repair = true
auto_upgrade = true
}

# 升级设置
upgrade_settings {
max_surge = 1
max_unavailable = 0
}
}

# Spot 实例节点池
resource "google_container_node_pool" "spot_nodes" {
name = "${var.cluster_name}-spot-pool"
location = var.region
cluster = google_container_cluster.primary.name

node_count = 0

autoscaling {
min_node_count = 0
max_node_count = 5
}

node_config {
preemptible = true
machine_type = var.spot_machine_type

service_account = google_service_account.kubernetes.email
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only"
]

labels = {
env = var.environment
instance-type = "spot"
}

taint {
key = "instance-type"
value = "spot"
effect = "NO_SCHEDULE"
}

disk_size_gb = 30
disk_type = "pd-standard"

tags = ["gke-node", "${var.cluster_name}-spot-node"]

metadata = {
disable-legacy-endpoints = "true"
}
}

management {
auto_repair = true
auto_upgrade = true
}
}

# 服务账号
resource "google_service_account" "kubernetes" {
account_id = "${var.cluster_name}-sa"
display_name = "Kubernetes Service Account"
}

# BigQuery 数据集
resource "google_bigquery_dataset" "gke_usage" {
dataset_id = "gke_cluster_resource_usage"
friendly_name = "GKE Cluster Resource Usage"
description = "Resource usage data for GKE cluster"
location = var.region
default_table_expiration_ms = 3600000

labels = {
env = var.environment
}
}

监控与可观测性

Prometheus + Grafana 部署

# monitoring-stack.yml
apiVersion: v1
kind: Namespace
metadata:
name: monitoring

---
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
name: prometheus
namespace: kube-system
spec:
chart: kube-prometheus-stack
repo: https://prometheus-community.github.io/helm-charts
targetNamespace: monitoring
valuesContent: |
prometheus:
prometheusSpec:
retention: 30d
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: fast-ssd
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 100Gi
resources:
requests:
memory: 2Gi
cpu: 1000m
limits:
memory: 4Gi
cpu: 2000m
additionalScrapeConfigs:
- job_name: 'istio-mesh'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-proxy;http-monitoring
- job_name: 'istio-policy'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-policy;http-monitoring
- job_name: 'istio-telemetry'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-telemetry;http-monitoring

grafana:
adminPassword: ${GRAFANA_ADMIN_PASSWORD}
persistence:
enabled: true
storageClassName: fast-ssd
size: 10Gi
resources:
requests:
memory: 256Mi
cpu: 100m
limits:
memory: 512Mi
cpu: 200m
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards/default
dashboards:
default:
istio-mesh:
gnetId: 7639
revision: 22
datasource: Prometheus
istio-service:
gnetId: 7636
revision: 22
datasource: Prometheus
istio-workload:
gnetId: 7630
revision: 22
datasource: Prometheus
kubernetes-cluster:
gnetId: 7249
revision: 1
datasource: Prometheus

alertmanager:
alertmanagerSpec:
storage:
volumeClaimTemplate:
spec:
storageClassName: fast-ssd
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
resources:
requests:
memory: 256Mi
cpu: 100m
limits:
memory: 512Mi
cpu: 200m
config:
global:
smtp_smarthost: 'smtp.gmail.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: '${SMTP_PASSWORD}'
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'web.hook'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
- match:
severity: warning
receiver: 'warning-alerts'
receivers:
- name: 'web.hook'
webhook_configs:
- url: 'http://alertmanager-webhook:5000/'
- name: 'critical-alerts'
email_configs:
- to: 'oncall@example.com'
subject: 'Critical Alert: {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
{{ end }}
slack_configs:
- api_url: '${SLACK_WEBHOOK_URL}'
channel: '#alerts'
title: 'Critical Alert'
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
- name: 'warning-alerts'
email_configs:
- to: 'team@example.com'
subject: 'Warning Alert: {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
{{ end }}

分布式追踪

# jaeger-deployment.yml
apiVersion: v1
kind: Namespace
metadata:
name: observability

---
apiVersion: jaegertracing.io/v1
kind: Jaeger
metadata:
name: jaeger
namespace: observability
spec:
strategy: production

collector:
maxReplicas: 5
resources:
limits:
cpu: 1000m
memory: 1Gi
requests:
cpu: 500m
memory: 512Mi

query:
replicas: 2
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 250m
memory: 256Mi

storage:
type: elasticsearch
elasticsearch:
nodeCount: 3
redundancyPolicy: SingleRedundancy
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
cpu: 2000m
memory: 4Gi
storage:
storageClassName: fast-ssd
size: 100Gi

ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx
cert-manager.io/cluster-issuer: letsencrypt-prod
hosts:
- jaeger.example.com
tls:
- secretName: jaeger-tls
hosts:
- jaeger.example.com

总结

云原生部署通过微服务架构、容器化、服务网格和云平台的结合,为现代应用提供了强大的部署和运维能力。关键要素包括:

核心优势

  1. 可扩展性:水平扩展和弹性伸缩
  2. 可靠性:故障隔离和自动恢复
  3. 可维护性:独立部署和版本管理
  4. 可观测性:全面的监控和追踪
  5. 安全性:零信任网络和身份认证

最佳实践

  1. 架构设计:遵循微服务设计原则
  2. 服务治理:使用服务网格管理服务间通信
  3. 基础设施:采用基础设施即代码
  4. 监控运维:建立完善的可观测性体系
  5. 安全合规:集成安全策略和合规要求

通过遵循这些最佳实践,可以构建一个高效、可靠、安全的云原生应用部署平台。