跳到主要内容

微服务与云平台部署

概述

云原生部署是现代应用程序开发和部署的核心方法,它结合了微服务架构、容器化、服务网格和云平台的优势,提供了高度可扩展、弹性和可维护的解决方案。

微服务架构设计

架构原则

# microservices-principles.yml
architecture_principles:
single_responsibility:
description: "每个服务只负责一个业务功能"
benefits:
- "高内聚,低耦合"
- "独立开发和部署"
- "技术栈灵活性"

decentralized_governance:
description: "去中心化的治理模式"
practices:
- "数据库分离"
- "独立的CI/CD流水线"
- "团队自主决策"

failure_isolation:
description: "故障隔离和容错设计"
patterns:
- "断路器模式"
- "舱壁模式"
- "超时和重试"

evolutionary_design:
description: "演进式设计"
approaches:
- "API版本控制"
- "向后兼容"
- "渐进式迁移"

服务拆分策略

微服务通信模式

同步通信

// api-gateway.js
//
// Edge API gateway: applies security headers, CORS and rate limiting,
// then proxies /api/<service>s routes to the matching backend microservice.
const express = require('express');
// BUGFIX: http-proxy-middleware v1+ exports { createProxyMiddleware };
// the module itself has not been directly callable since v0.x.
const { createProxyMiddleware } = require('http-proxy-middleware');
const rateLimit = require('express-rate-limit');
const helmet = require('helmet');
const cors = require('cors');

const app = express();

// Security middleware
app.use(helmet());
app.use(cors());

// Rate limiting: 100 requests per IP per 15-minute window
const limiter = rateLimit({
  windowMs: 15 * 60 * 1000, // 15 minutes
  max: 100, // limit each IP to 100 requests per window
  message: 'Too many requests from this IP'
});
app.use(limiter);

// Service discovery table: proxy target per service, overridable via env vars.
const services = {
  user: {
    target: process.env.USER_SERVICE_URL || 'http://user-service:3001',
    changeOrigin: true,
    pathRewrite: { '^/api/users': '' }
  },
  order: {
    target: process.env.ORDER_SERVICE_URL || 'http://order-service:3002',
    changeOrigin: true,
    pathRewrite: { '^/api/orders': '' }
  },
  payment: {
    target: process.env.PAYMENT_SERVICE_URL || 'http://payment-service:3003',
    changeOrigin: true,
    pathRewrite: { '^/api/payments': '' }
  }
};

// Liveness probe endpoint
app.get('/health', (req, res) => {
  res.json({ status: 'healthy', timestamp: new Date().toISOString() });
});

// Mount one proxy per service at /api/<service>s (e.g. /api/users).
Object.keys(services).forEach(service => {
  const config = services[service];

  const proxyOptions = {
    ...config,
    // Per-request logging of method, URL and upstream target.
    onProxyReq: (proxyReq, req, res) => {
      console.log(`[${new Date().toISOString()}] ${req.method} ${req.url} -> ${config.target}`);
    },
    onError: (err, req, res) => {
      console.error(`Proxy error for ${service}:`, err.message);
      // BUGFIX: if the upstream died mid-response the headers are already
      // sent; calling res.status() then would crash the gateway.
      if (!res.headersSent) {
        res.status(503).json({ error: 'Service temporarily unavailable' });
      }
    }
  };

  app.use(`/api/${service}s`, createProxyMiddleware(proxyOptions));
});

// Last-resort error handler for anything the proxies did not catch.
app.use((err, req, res, next) => {
  console.error('Gateway error:', err);
  res.status(500).json({ error: 'Internal server error' });
});

const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
  console.log(`API Gateway running on port ${PORT}`);
});

异步通信

// event-bus.js
const amqp = require('amqplib');
const EventEmitter = require('events');

/**
 * Thin wrapper around an AMQP (RabbitMQ) connection providing
 * publish/subscribe with JSON payloads, retry with exponential backoff,
 * and a dead-letter fallback. It is also an EventEmitter: connection-level
 * 'error' and 'close' events are re-emitted on the bus itself.
 */
class EventBus extends EventEmitter {
  constructor() {
    super();
    this.connection = null;     // amqplib connection, set by connect()
    this.channel = null;        // amqplib channel, set by connect()
    this.exchanges = new Map(); // exchange name -> { type, options }
  }

  /**
   * Connect to RabbitMQ and open a channel.
   * @param {string} [url] - AMQP URL; defaults to RABBITMQ_URL or localhost.
   * @returns {Promise<EventBus>} this, for chaining.
   * @throws if the connection cannot be established.
   */
  async connect(url = process.env.RABBITMQ_URL || 'amqp://localhost') {
    try {
      this.connection = await amqp.connect(url);
      this.channel = await this.connection.createChannel();

      // Surface connection-level failures to listeners on this bus.
      this.connection.on('error', (err) => {
        console.error('RabbitMQ connection error:', err);
        this.emit('error', err);
      });

      this.connection.on('close', () => {
        console.log('RabbitMQ connection closed');
        this.emit('close');
      });

      console.log('Connected to RabbitMQ');
      return this;
    } catch (error) {
      console.error('Failed to connect to RabbitMQ:', error);
      throw error;
    }
  }

  /** Assert (create if missing) an exchange and remember its configuration. */
  async createExchange(name, type = 'topic', options = { durable: true }) {
    await this.channel.assertExchange(name, type, options);
    this.exchanges.set(name, { type, options });
    return this;
  }

  /**
   * Publish a JSON message, stamped with a timestamp and a unique messageId.
   * Messages are persistent by default.
   */
  async publish(exchange, routingKey, message, options = {}) {
    const messageBuffer = Buffer.from(JSON.stringify({
      ...message,
      timestamp: new Date().toISOString(),
      messageId: this.generateMessageId()
    }));

    const publishOptions = {
      persistent: true,
      contentType: 'application/json',
      ...options
    };

    return this.channel.publish(exchange, routingKey, messageBuffer, publishOptions);
  }

  /**
   * Bind a server-named queue to exchange/routingKey and consume it.
   * Failed messages are re-published with exponential backoff up to
   * options.maxRetries (default 3) times, then routed to the dead-letter
   * exchange. The original delivery is always acked in the end.
   */
  async subscribe(exchange, routingKey, handler, options = {}) {
    const queueOptions = {
      durable: true,
      exclusive: false,
      autoDelete: false,
      ...options.queue
    };

    const { queue } = await this.channel.assertQueue('', queueOptions);
    await this.channel.bindQueue(queue, exchange, routingKey);

    const consumeOptions = {
      noAck: false,
      ...options.consume
    };

    await this.channel.consume(queue, async (msg) => {
      if (msg) {
        try {
          const content = JSON.parse(msg.content.toString());
          await handler(content, msg);
          this.channel.ack(msg);
        } catch (error) {
          console.error('Message processing error:', error);

          // BUGFIX: headers is undefined when the publisher set none;
          // optional chaining keeps the retry counter from throwing.
          const retryCount = (msg.properties.headers?.['x-retry-count'] ?? 0) + 1;
          const maxRetries = options.maxRetries || 3;

          if (retryCount <= maxRetries) {
            // Exponential backoff: 2^retryCount seconds.
            setTimeout(() => {
              this.channel.publish(exchange, routingKey, msg.content, {
                ...msg.properties,
                headers: {
                  ...msg.properties.headers,
                  'x-retry-count': retryCount
                }
              });
            }, Math.pow(2, retryCount) * 1000);
          } else {
            // Give up: record on the dead-letter exchange. Never let this
            // throw, or the original message would stay unacked forever.
            try {
              await this.publishToDeadLetter(msg, error);
            } catch (dlqError) {
              console.error('Failed to publish to dead-letter exchange:', dlqError);
            }
          }

          // Ack the original either way; a retried copy (or the DLQ record)
          // carries the message forward.
          this.channel.ack(msg);
        }
      }
    }, consumeOptions);

    return queue;
  }

  /**
   * Record a permanently-failed message on the 'dead-letter' exchange.
   * BUGFIX: the payload may be the very thing that failed JSON.parse in
   * the consumer, so fall back to the raw string instead of parsing again
   * and throwing a second time.
   */
  async publishToDeadLetter(originalMsg, error) {
    const deadLetterExchange = 'dead-letter';
    await this.createExchange(deadLetterExchange);

    const rawContent = originalMsg.content.toString();
    let originalMessage;
    try {
      originalMessage = JSON.parse(rawContent);
    } catch {
      originalMessage = rawContent;
    }

    const deadLetterMessage = {
      originalMessage,
      error: error.message,
      timestamp: new Date().toISOString(),
      routingKey: originalMsg.fields.routingKey
    };

    await this.publish(deadLetterExchange, 'failed', deadLetterMessage);
  }

  /** Generate a reasonably-unique id: timestamp plus random base36 suffix. */
  generateMessageId() {
    // slice() replaces the deprecated substr().
    return `${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
  }

  /** Close the channel and connection, if open. */
  async close() {
    if (this.channel) {
      await this.channel.close();
    }
    if (this.connection) {
      await this.connection.close();
    }
  }
}

// Usage example: one shared EventBus instance for the services below
const eventBus = new EventBus();

// 订单服务发布事件
/**
 * Order service: persists orders (simulated) and announces each new order
 * on the 'orders' exchange as an 'order.created' event.
 */
class OrderService {
  constructor(eventBus) {
    this.eventBus = eventBus;
  }

  /**
   * Create an order and publish the corresponding domain event.
   * @param {object} orderData - userId, amount, items, ...
   * @returns {Promise<object>} the stored order.
   */
  async createOrder(orderData) {
    const order = await this.saveOrder(orderData);

    // Announce creation so downstream services (inventory, payment, ...)
    // can react asynchronously.
    const { id: orderId, userId, amount, items } = order;
    await this.eventBus.publish('orders', 'order.created', {
      orderId,
      userId,
      amount,
      items
    });

    return order;
  }

  /** Simulated persistence: timestamp id, 'pending' status, creation date. */
  async saveOrder(orderData) {
    const stored = {
      id: Date.now(),
      ...orderData,
      status: 'pending',
      createdAt: new Date()
    };
    return stored;
  }
}

// 库存服务订阅事件
/**
 * Inventory service: subscribes to order events, reserves stock for each
 * line item, and publishes an 'inventory.updated' event when done.
 */
class InventoryService {
  constructor(eventBus) {
    this.eventBus = eventBus;
    // BUGFIX: setupEventHandlers is async and a constructor cannot await it.
    // The original left the promise floating, so a failed subscription
    // (e.g. broker down) became an unhandled rejection. Attach a handler.
    this.setupEventHandlers().catch((err) => {
      console.error('Failed to set up inventory event handlers:', err);
    });
  }

  /** Subscribe to order.created with up to 3 retries per message. */
  async setupEventHandlers() {
    await this.eventBus.subscribe('orders', 'order.created',
      this.handleOrderCreated.bind(this),
      { maxRetries: 3 }
    );
  }

  /**
   * Handle an order.created event: reserve stock per item, then announce
   * the reservation on the 'inventory' exchange.
   * @param {{orderId: *, items: Array<{productId: *, quantity: number}>}} event
   */
  async handleOrderCreated(event) {
    console.log('Processing order created event:', event);

    // Reduce stock sequentially, preserving item order.
    for (const item of event.items) {
      await this.reduceInventory(item.productId, item.quantity);
    }

    await this.eventBus.publish('inventory', 'inventory.updated', {
      orderId: event.orderId,
      items: event.items,
      status: 'reserved'
    });
  }

  /** Simulated stock decrement. */
  async reduceInventory(productId, quantity) {
    console.log(`Reducing inventory for product ${productId} by ${quantity}`);
  }
}

module.exports = { EventBus, OrderService, InventoryService };

服务网格 (Service Mesh)

Istio 配置

基础配置

# istio-installation.yml
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
metadata:
name: control-plane
spec:
values:
global:
meshID: mesh1
multiCluster:
clusterName: cluster1
network: network1
pilot:
env:
EXTERNAL_ISTIOD: false
components:
pilot:
k8s:
resources:
requests:
cpu: 200m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
hpaSpec:
minReplicas: 2
maxReplicas: 5
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 80
ingressGateways:
- name: istio-ingressgateway
enabled: true
k8s:
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 2000m
memory: 1024Mi
hpaSpec:
minReplicas: 2
maxReplicas: 5
service:
type: LoadBalancer
ports:
- port: 15021
targetPort: 15021
name: status-port
- port: 80
targetPort: 8080
name: http2
- port: 443
targetPort: 8443
name: https
egressGateways:
- name: istio-egressgateway
enabled: true
k8s:
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 2000m
memory: 1024Mi

流量管理

# traffic-management.yml
---
apiVersion: networking.istio.io/v1beta1
kind: Gateway
metadata:
name: microservices-gateway
namespace: production
spec:
selector:
istio: ingressgateway
servers:
- port:
number: 80
name: http
protocol: HTTP
hosts:
- api.example.com
tls:
httpsRedirect: true
- port:
number: 443
name: https
protocol: HTTPS
tls:
mode: SIMPLE
credentialName: api-tls-secret
hosts:
- api.example.com

---
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: microservices-vs
namespace: production
spec:
hosts:
- api.example.com
gateways:
- microservices-gateway
http:
- match:
- uri:
prefix: /api/users
route:
- destination:
host: user-service
port:
number: 3001
weight: 90
- destination:
host: user-service-v2
port:
number: 3001
weight: 10
fault:
delay:
percentage:
value: 0.1
fixedDelay: 5s
retries:
attempts: 3
perTryTimeout: 2s
timeout: 10s
- match:
- uri:
prefix: /api/orders
route:
- destination:
host: order-service
port:
number: 3002
corsPolicy:
allowOrigins:
- exact: https://app.example.com
allowMethods:
- GET
- POST
- PUT
- DELETE
allowHeaders:
- authorization
- content-type
maxAge: 24h

---
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: user-service-dr
namespace: production
spec:
host: user-service
trafficPolicy:
connectionPool:
tcp:
maxConnections: 100
http:
http1MaxPendingRequests: 50
http2MaxRequests: 100
maxRequestsPerConnection: 10
maxRetries: 3
loadBalancer:
simple: LEAST_CONN
outlierDetection:
consecutiveGatewayErrors: 3
consecutive5xxErrors: 3
interval: 30s
baseEjectionTime: 30s
maxEjectionPercent: 50
subsets:
- name: v1
labels:
version: v1
- name: v2
labels:
version: v2
trafficPolicy:
connectionPool:
tcp:
maxConnections: 50

安全策略

# security-policies.yml
---
apiVersion: security.istio.io/v1beta1
kind: PeerAuthentication
metadata:
name: default
namespace: production
spec:
mtls:
mode: STRICT

---
apiVersion: security.istio.io/v1beta1
kind: AuthorizationPolicy
metadata:
name: user-service-authz
namespace: production
spec:
selector:
matchLabels:
app: user-service
rules:
- from:
- source:
principals: ["cluster.local/ns/production/sa/api-gateway"]
- to:
- operation:
methods: ["GET"]
paths: ["/health", "/metrics"]
- from:
- source:
requestPrincipals: ["*"]
to:
- operation:
methods: ["GET", "POST", "PUT", "DELETE"]
when:
- key: request.headers[authorization]
values: ["Bearer *"]

---
apiVersion: security.istio.io/v1beta1
kind: RequestAuthentication
metadata:
name: jwt-auth
namespace: production
spec:
selector:
matchLabels:
app: user-service
jwtRules:
- issuer: "https://auth.example.com"
jwksUri: "https://auth.example.com/.well-known/jwks.json"
audiences:
- "api.example.com"
forwardOriginalToken: true

云平台部署

AWS EKS 部署

Terraform 配置

# eks-cluster.tf
terraform {
required_version = ">= 1.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
kubernetes = {
source = "hashicorp/kubernetes"
version = "~> 2.20"
}
helm = {
source = "hashicorp/helm"
version = "~> 2.10"
}
}
}

provider "aws" {
region = var.aws_region
}

# VPC 配置
module "vpc" {
source = "terraform-aws-modules/vpc/aws"

name = "${var.cluster_name}-vpc"
cidr = var.vpc_cidr

azs = data.aws_availability_zones.available.names
private_subnets = var.private_subnets
public_subnets = var.public_subnets

enable_nat_gateway = true
single_nat_gateway = false
enable_dns_hostnames = true
enable_dns_support = true

public_subnet_tags = {
"kubernetes.io/role/elb" = "1"
}

private_subnet_tags = {
"kubernetes.io/role/internal-elb" = "1"
}

tags = {
"kubernetes.io/cluster/${var.cluster_name}" = "shared"
}
}

# EKS 集群
module "eks" {
source = "terraform-aws-modules/eks/aws"

cluster_name = var.cluster_name
cluster_version = var.kubernetes_version

vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnets
cluster_endpoint_public_access = true

# 集群加密
cluster_encryption_config = {
provider_key_arn = aws_kms_key.eks.arn
resources = ["secrets"]
}

# 集群日志
cluster_enabled_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]

# 节点组
eks_managed_node_groups = {
main = {
name = "main-node-group"

instance_types = ["t3.medium"]
capacity_type = "ON_DEMAND"

min_size = 2
max_size = 10
desired_size = 3

ami_type = "AL2_x86_64"

# 启动模板
launch_template_name = "main-node-group"
launch_template_version = "$Latest"

# 磁盘配置
block_device_mappings = {
xvda = {
device_name = "/dev/xvda"
ebs = {
volume_size = 50
volume_type = "gp3"
iops = 3000
throughput = 150
encrypted = true
kms_key_id = aws_kms_key.ebs.arn
delete_on_termination = true
}
}
}

# 网络配置
remote_access = {
ec2_ssh_key = var.ec2_ssh_key
source_security_group_ids = [aws_security_group.remote_access.id]
}

# 标签
tags = {
Environment = var.environment
NodeGroup = "main"
}
}

spot = {
name = "spot-node-group"

instance_types = ["t3.medium", "t3.large"]
capacity_type = "SPOT"

min_size = 0
max_size = 5
desired_size = 2

# Spot 实例配置
use_mixed_instances_policy = true
mixed_instances_policy = {
instances_distribution = {
on_demand_base_capacity = 0
on_demand_percentage_above_base_capacity = 0
spot_allocation_strategy = "capacity-optimized"
}

override = [
{
instance_type = "t3.medium"
weighted_capacity = "1"
},
{
instance_type = "t3.large"
weighted_capacity = "2"
}
]
}

# 污点配置
taints = {
spot = {
key = "spot"
value = "true"
effect = "NO_SCHEDULE"
}
}

tags = {
Environment = var.environment
NodeGroup = "spot"
}
}
}

# AWS Load Balancer Controller
enable_aws_load_balancer_controller = true

# EBS CSI Driver
enable_ebs_csi_driver = true

tags = {
Environment = var.environment
Terraform = "true"
}
}

# KMS 密钥
resource "aws_kms_key" "eks" {
description = "EKS Secret Encryption Key"
deletion_window_in_days = 7
enable_key_rotation = true

tags = {
Name = "${var.cluster_name}-eks-key"
}
}

resource "aws_kms_alias" "eks" {
name = "alias/${var.cluster_name}-eks"
target_key_id = aws_kms_key.eks.key_id
}

resource "aws_kms_key" "ebs" {
description = "EBS Encryption Key"
deletion_window_in_days = 7
enable_key_rotation = true

tags = {
Name = "${var.cluster_name}-ebs-key"
}
}

# 安全组
resource "aws_security_group" "remote_access" {
name_prefix = "${var.cluster_name}-remote-access"
vpc_id = module.vpc.vpc_id

ingress {
description = "SSH"
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = var.allowed_cidr_blocks
}

egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}

tags = {
Name = "${var.cluster_name}-remote-access"
}
}

# 数据源
data "aws_availability_zones" "available" {
filter {
name = "opt-in-status"
values = ["opt-in-not-required"]
}
}

data "aws_caller_identity" "current" {}

变量定义

# variables.tf
variable "aws_region" {
description = "AWS region"
type = string
default = "us-west-2"
}

variable "cluster_name" {
description = "Name of the EKS cluster"
type = string
default = "microservices-cluster"
}

variable "kubernetes_version" {
description = "Kubernetes version"
type = string
default = "1.27"
}

variable "environment" {
description = "Environment name"
type = string
default = "production"
}

variable "vpc_cidr" {
description = "CIDR block for VPC"
type = string
default = "10.0.0.0/16"
}

variable "private_subnets" {
description = "Private subnet CIDR blocks"
type = list(string)
default = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
}

variable "public_subnets" {
description = "Public subnet CIDR blocks"
type = list(string)
default = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
}

variable "ec2_ssh_key" {
description = "EC2 Key Pair name for SSH access"
type = string
default = ""
}

variable "allowed_cidr_blocks" {
description = "CIDR blocks allowed for remote access"
type = list(string)
default = ["10.0.0.0/8"]
}

Google Cloud GKE 部署

# gke-cluster.tf
terraform {
required_providers {
google = {
source = "hashicorp/google"
version = "~> 4.0"
}
}
}

provider "google" {
project = var.project_id
region = var.region
}

# VPC 网络
resource "google_compute_network" "vpc" {
name = "${var.cluster_name}-vpc"
auto_create_subnetworks = false
routing_mode = "GLOBAL"
}

# 子网
resource "google_compute_subnetwork" "subnet" {
name = "${var.cluster_name}-subnet"
ip_cidr_range = var.subnet_cidr
region = var.region
network = google_compute_network.vpc.id

secondary_ip_range {
range_name = "services-range"
ip_cidr_range = var.services_cidr
}

secondary_ip_range {
range_name = "pod-ranges"
ip_cidr_range = var.pod_cidr
}

private_ip_google_access = true
}

# GKE 集群
resource "google_container_cluster" "primary" {
name = var.cluster_name
location = var.region

# 移除默认节点池
remove_default_node_pool = true
initial_node_count = 1

# 网络配置
network = google_compute_network.vpc.name
subnetwork = google_compute_subnetwork.subnet.name

# IP 分配策略
ip_allocation_policy {
cluster_secondary_range_name = "pod-ranges"
services_secondary_range_name = "services-range"
}

# 网络策略
network_policy {
enabled = true
}

# 主版本配置
master_version = var.kubernetes_version

# 主节点授权网络
master_authorized_networks_config {
dynamic "cidr_blocks" {
for_each = var.authorized_networks
content {
cidr_block = cidr_blocks.value.cidr_block
display_name = cidr_blocks.value.display_name
}
}
}

# 私有集群配置
private_cluster_config {
enable_private_nodes = true
enable_private_endpoint = false
master_ipv4_cidr_block = var.master_cidr
}

# 工作负载身份
workload_identity_config {
workload_pool = "${var.project_id}.svc.id.goog"
}

# 集群功能
addons_config {
http_load_balancing {
disabled = false
}

horizontal_pod_autoscaling {
disabled = false
}

network_policy_config {
disabled = false
}

# NOTE(review): the istio_config GKE add-on is deprecated and removed from the
# google provider in v4+; install Istio / Anthos Service Mesh separately instead.
istio_config {
disabled = false
auth = "AUTH_MUTUAL_TLS"
}
}

# 维护策略
maintenance_policy {
recurring_window {
start_time = "2023-01-01T02:00:00Z"
end_time = "2023-01-01T06:00:00Z"
recurrence = "FREQ=WEEKLY;BYDAY=SA"
}
}

# 资源使用导出
resource_usage_export_config {
enable_network_egress_metering = true
enable_resource_consumption_metering = true

bigquery_destination {
dataset_id = google_bigquery_dataset.gke_usage.dataset_id
}
}
}

# 节点池
resource "google_container_node_pool" "primary_nodes" {
name = "${var.cluster_name}-node-pool"
location = var.region
cluster = google_container_cluster.primary.name
version = var.kubernetes_version

node_count = var.node_count

# 自动扩缩容
autoscaling {
min_node_count = var.min_node_count
max_node_count = var.max_node_count
}

# 节点配置
node_config {
preemptible = false
machine_type = var.machine_type

# Google服务账号
service_account = google_service_account.kubernetes.email
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only"
]

# 标签
labels = {
env = var.environment
}

# 污点
taint {
key = "instance-type"
value = "standard"
effect = "NO_SCHEDULE"
}

# 磁盘配置
disk_size_gb = var.disk_size_gb
disk_type = "pd-standard"

# 网络标签
tags = ["gke-node", "${var.cluster_name}-node"]

# 元数据
metadata = {
disable-legacy-endpoints = "true"
}
}

# 管理配置
management {
auto_repair = true
auto_upgrade = true
}

# 升级设置
upgrade_settings {
max_surge = 1
max_unavailable = 0
}
}

# Spot 实例节点池
resource "google_container_node_pool" "spot_nodes" {
name = "${var.cluster_name}-spot-pool"
location = var.region
cluster = google_container_cluster.primary.name

node_count = 0

autoscaling {
min_node_count = 0
max_node_count = 5
}

node_config {
preemptible = true
machine_type = var.spot_machine_type

service_account = google_service_account.kubernetes.email
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only"
]

labels = {
env = var.environment
instance-type = "spot"
}

taint {
key = "instance-type"
value = "spot"
effect = "NO_SCHEDULE"
}

disk_size_gb = 30
disk_type = "pd-standard"

tags = ["gke-node", "${var.cluster_name}-spot-node"]

metadata = {
disable-legacy-endpoints = "true"
}
}

management {
auto_repair = true
auto_upgrade = true
}
}

# 服务账号
resource "google_service_account" "kubernetes" {
account_id = "${var.cluster_name}-sa"
display_name = "Kubernetes Service Account"
}

# BigQuery 数据集
resource "google_bigquery_dataset" "gke_usage" {
dataset_id = "gke_cluster_resource_usage"
friendly_name = "GKE Cluster Resource Usage"
description = "Resource usage data for GKE cluster"
location = var.region
default_table_expiration_ms = 3600000

labels = {
env = var.environment
}
}

监控与可观测性

Prometheus + Grafana 部署

# monitoring-stack.yml
apiVersion: v1
kind: Namespace
metadata:
name: monitoring

---
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
name: prometheus
namespace: kube-system
spec:
chart: kube-prometheus-stack
repo: https://prometheus-community.github.io/helm-charts
targetNamespace: monitoring
valuesContent: |
prometheus:
prometheusSpec:
retention: 30d
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: fast-ssd
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 100Gi
resources:
requests:
memory: 2Gi
cpu: 1000m
limits:
memory: 4Gi
cpu: 2000m
additionalScrapeConfigs:
- job_name: 'istio-mesh'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-proxy;http-monitoring
- job_name: 'istio-policy'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-policy;http-monitoring
- job_name: 'istio-telemetry'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: istio-telemetry;http-monitoring

grafana:
adminPassword: ${GRAFANA_ADMIN_PASSWORD}
persistence:
enabled: true
storageClassName: fast-ssd
size: 10Gi
resources:
requests:
memory: 256Mi
cpu: 100m
limits:
memory: 512Mi
cpu: 200m
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards/default
dashboards:
default:
istio-mesh:
gnetId: 7639
revision: 22
datasource: Prometheus
istio-service:
gnetId: 7636
revision: 22
datasource: Prometheus
istio-workload:
gnetId: 7630
revision: 22
datasource: Prometheus
kubernetes-cluster:
gnetId: 7249
revision: 1
datasource: Prometheus

alertmanager:
alertmanagerSpec:
storage:
volumeClaimTemplate:
spec:
storageClassName: fast-ssd
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
resources:
requests:
memory: 256Mi
cpu: 100m
limits:
memory: 512Mi
cpu: 200m
config:
global:
smtp_smarthost: 'smtp.gmail.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: '${SMTP_PASSWORD}'
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'web.hook'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
- match:
severity: warning
receiver: 'warning-alerts'
receivers:
- name: 'web.hook'
webhook_configs:
- url: 'http://alertmanager-webhook:5000/'
- name: 'critical-alerts'
email_configs:
- to: 'oncall@example.com'
subject: 'Critical Alert: {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
{{ end }}
slack_configs:
- api_url: '${SLACK_WEBHOOK_URL}'
channel: '#alerts'
title: 'Critical Alert'
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
- name: 'warning-alerts'
email_configs:
- to: 'team@example.com'
subject: 'Warning Alert: {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
{{ end }}

分布式追踪

# jaeger-deployment.yml
apiVersion: v1
kind: Namespace
metadata:
name: observability

---
apiVersion: jaegertracing.io/v1
kind: Jaeger
metadata:
name: jaeger
namespace: observability
spec:
strategy: production

collector:
maxReplicas: 5
resources:
limits:
cpu: 1000m
memory: 1Gi
requests:
cpu: 500m
memory: 512Mi

query:
replicas: 2
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 250m
memory: 256Mi

storage:
type: elasticsearch
elasticsearch:
nodeCount: 3
redundancyPolicy: SingleRedundancy
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
cpu: 2000m
memory: 4Gi
storage:
storageClassName: fast-ssd
size: 100Gi

ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx
cert-manager.io/cluster-issuer: letsencrypt-prod
hosts:
- jaeger.example.com
tls:
- secretName: jaeger-tls
hosts:
- jaeger.example.com

总结

云原生部署通过微服务架构、容器化、服务网格和云平台的结合,为现代应用提供了强大的部署和运维能力。关键要素包括:

核心优势

  1. 可扩展性:水平扩展和弹性伸缩
  2. 可靠性:故障隔离和自动恢复
  3. 可维护性:独立部署和版本管理
  4. 可观测性:全面的监控和追踪
  5. 安全性:零信任网络和身份认证

最佳实践

  1. 架构设计:遵循微服务设计原则
  2. 服务治理:使用服务网格管理服务间通信
  3. 基础设施:采用基础设施即代码
  4. 监控运维:建立完善的可观测性体系
  5. 安全合规:集成安全策略和合规要求

通过遵循这些最佳实践,可以构建一个高效、可靠、安全的云原生应用部署平台。