30 KiB
30 KiB
阶段6:生产部署和监控
阶段概述
时间: 0.5天
目标: 将迁移后的系统部署到生产环境,配置监控和告警,确保系统稳定运行
优先级: P0 (最高优先级)
详细任务清单
6.1 生产环境准备 (0.1天)
任务描述
准备生产环境的SQL Server和应用程序部署
具体工作
- 生产环境SQL Server配置
- 生产数据库创建和配置
- 应用程序配置文件准备
- SSL证书和安全配置
生产环境配置
1. 生产SQL Server配置
-- Production SQL Server provisioning script for honey_box_prod.
-- Run with sqlcmd or SSMS: GO is the client-side batch separator. The batches
-- are required because USE is resolved when a batch is compiled, so it cannot
-- share a batch with the CREATE DATABASE that creates its target.

-- Create the production database. The collation is declared here so that no
-- later ALTER DATABASE ... COLLATE is needed.
CREATE DATABASE [honey_box_prod]
ON (
    NAME = 'honey_box_prod_data',
    FILENAME = 'D:\Database\Data\honey_box_prod.mdf',
    SIZE = 1GB,
    MAXSIZE = 100GB,
    FILEGROWTH = 100MB
)
LOG ON (
    NAME = 'honey_box_prod_log',
    FILENAME = 'D:\Database\Logs\honey_box_prod.ldf',
    SIZE = 200MB,
    MAXSIZE = 10GB,
    FILEGROWTH = 10%
)
COLLATE Chinese_PRC_CI_AS;
GO

-- Database-level options for production
ALTER DATABASE [honey_box_prod] SET RECOVERY FULL; -- full recovery model in production
ALTER DATABASE [honey_box_prod] SET AUTO_CREATE_STATISTICS ON;
ALTER DATABASE [honey_box_prod] SET AUTO_UPDATE_STATISTICS ON;
ALTER DATABASE [honey_box_prod] SET AUTO_UPDATE_STATISTICS_ASYNC ON;
ALTER DATABASE [honey_box_prod] SET PAGE_VERIFY CHECKSUM;
ALTER DATABASE [honey_box_prod] SET ALLOW_SNAPSHOT_ISOLATION ON;
ALTER DATABASE [honey_box_prod] SET READ_COMMITTED_SNAPSHOT ON;
GO

-- Server-level login for the application.
-- NOTE(review): the passwords in this script are literals; replace them with
-- secrets supplied at deployment time before running against production.
CREATE LOGIN [honey_box_prod_app] WITH PASSWORD = 'HoneyBoxProd2024!@#$%';
GO

USE [honey_box_prod];
GO

CREATE USER [honey_box_prod_app] FOR LOGIN [honey_box_prod_app];

-- Grant the application read, write and DDL rights.
-- NOTE(review): db_ddladmin lets the application change the schema; confirm
-- it is really needed at runtime.
ALTER ROLE [db_datareader] ADD MEMBER [honey_box_prod_app];
ALTER ROLE [db_datawriter] ADD MEMBER [honey_box_prod_app];
ALTER ROLE [db_ddladmin] ADD MEMBER [honey_box_prod_app];

-- Read-only account (for reporting and monitoring)
CREATE LOGIN [honey_box_prod_readonly] WITH PASSWORD = 'HoneyBoxProdRead2024!@#$%';
CREATE USER [honey_box_prod_readonly] FOR LOGIN [honey_box_prod_readonly];
ALTER ROLE [db_datareader] ADD MEMBER [honey_box_prod_readonly];
GO

-- Instance-level performance settings (these affect the whole SQL Server instance)
EXEC sp_configure 'show advanced options', 1;
RECONFIGURE;

-- Maximum server memory (adjust to the host; leave memory for the OS)
EXEC sp_configure 'max server memory (MB)', 8192; -- 8GB
RECONFIGURE;

-- Degree of parallelism (adjust to the number of CPU cores)
EXEC sp_configure 'max degree of parallelism', 8;
RECONFIGURE;

-- Cost threshold for parallelism
EXEC sp_configure 'cost threshold for parallelism', 50;
RECONFIGURE;

-- Enable the ad hoc workload optimization
EXEC sp_configure 'optimize for ad hoc workloads', 1;
RECONFIGURE;
GO
2. 生产环境应用配置
// appsettings.Production.json
// Production configuration: database connections, logging, cache, monitoring
// and security settings.
{
// NOTE(review): both connection strings embed literal passwords; presumably
// these should come from a secret store or environment override - confirm.
"ConnectionStrings": {
// Read/write connection using the honey_box_prod_app login.
"DefaultConnection": "Server=prod-sql-server;Database=honey_box_prod;User Id=honey_box_prod_app;Password=HoneyBoxProd2024!@#$%;TrustServerCertificate=false;MultipleActiveResultSets=true;Encrypt=true;Connection Timeout=30;Command Timeout=60;",
// Read-only connection using the honey_box_prod_readonly login, with a longer
// command timeout (120 vs 60).
"ReadOnlyConnection": "Server=prod-sql-server;Database=honey_box_prod;User Id=honey_box_prod_readonly;Password=HoneyBoxProdRead2024!@#$%;TrustServerCertificate=false;Encrypt=true;Connection Timeout=30;Command Timeout=120;"
},
"DatabaseSettings": {
// Same value as "Command Timeout" in DefaultConnection; presumably seconds.
"CommandTimeout": 60,
// Both diagnostic switches are off for production.
"EnableSensitiveDataLogging": false,
"EnableDetailedErrors": false,
"MaxRetryCount": 3,
// Looks like an hh:mm:ss time span (5 seconds) - confirm how it is parsed.
"MaxRetryDelay": "00:00:05"
},
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning",
"Microsoft.EntityFrameworkCore": "Warning",
"HoneyBox": "Information"
}
},
"Cache": {
"Redis": {
// NOTE(review): no password or TLS option appears in this connection string.
"ConnectionString": "prod-redis-server:6379",
"Database": 0,
"KeyPrefix": "honeybox:prod:"
},
"Memory": {
// 104857600 = 100 * 1024 * 1024; presumably bytes - confirm against the consumer.
"SizeLimit": 104857600,
"CompactionPercentage": 0.25
}
},
"Monitoring": {
"ApplicationInsights": {
// Placeholder value; must be replaced with the real key before deployment.
"ConnectionString": "InstrumentationKey=your-app-insights-key"
},
"HealthChecks": {
"Enabled": true,
"DatabaseTimeout": "00:00:30",
"CacheTimeout": "00:00:10"
}
},
"Security": {
"RequireHttps": true,
// Looks like a d.hh:mm:ss time span (365 days) - confirm how it is parsed.
"HstsMaxAge": "365.00:00:00",
// NOTE(review): 'unsafe-inline' is allowed for both scripts and styles.
"ContentSecurityPolicy": "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline';"
}
}
6.2 数据迁移执行 (0.2天)
任务描述
执行生产环境的数据迁移
具体工作
- 生产数据备份
- 执行表结构创建
- 执行数据迁移
- 数据验证和确认
生产迁移脚本
1. 生产迁移执行计划
# Production-Migration.ps1
# Production database migration script (MySQL -> SQL Server).
#
# Steps:
#   1. Create a timestamped backup directory under $BackupPath.
#   2. Dump the source MySQL database with mysqldump.
#   3. Export the core tables as tab-separated text (one .csv file per table).
#   4. Create the SQL Server schema from CreateTables.sql.
#   5. Run MigrateData.sql with {{CSV_EXPORT_DIR}} replaced by the export path.
#   6. Run ValidateData.sql and print its output.
#
# Parameters:
#   -SourceServer / -SourceDatabase   MySQL host and database to migrate from.
#   -TargetServer / -TargetDatabase   SQL Server instance and database to migrate to.
#   -BackupPath                       Root directory for backups and exports.
#   -DryRun                           Print the plan without running the external
#                                     tools (the backup directories are still created).
#
# Any failed step throws and stops the migration.
param(
    [string]$SourceServer = "prod-mysql-server",
    [string]$SourceDatabase = "youdas",
    [string]$TargetServer = "prod-sql-server",
    [string]$TargetDatabase = "honey_box_prod",
    [string]$BackupPath = "D:\Backup\Migration",
    [switch]$DryRun = $false
)

$ErrorActionPreference = "Stop"
$StartTime = Get-Date

Write-Host "=== 生产环境数据库迁移开始 ===" -ForegroundColor Green
Write-Host "开始时间: $StartTime"
Write-Host "源数据库: $SourceServer/$SourceDatabase"
Write-Host "目标数据库: $TargetServer/$TargetDatabase"
Write-Host "备份路径: $BackupPath"
Write-Host "试运行模式: $DryRun"
Write-Host ""

# 1. Create the backup directory
$BackupDir = Join-Path $BackupPath (Get-Date -Format "yyyyMMdd_HHmmss")
New-Item -ItemType Directory -Path $BackupDir -Force | Out-Null
Write-Host "创建备份目录: $BackupDir" -ForegroundColor Yellow

# 2. Back up the MySQL data
Write-Host "开始备份MySQL数据..." -ForegroundColor Yellow
$MySQLBackupFile = Join-Path $BackupDir "mysql_backup.sql"
# --result-file makes mysqldump write the file itself. Redirecting with the
# PowerShell '>' operator would re-encode the dump and corrupt utf8mb4 data.
$MySQLDumpArgs = @(
    '-h', $SourceServer,
    '-u', 'root',
    '-p',
    '--single-transaction',
    '--routines',
    '--triggers',
    '--events',
    '--hex-blob',
    '--default-character-set=utf8mb4',
    "--result-file=$MySQLBackupFile",
    $SourceDatabase
)
$MySQLBackupCmd = "mysqldump $($MySQLDumpArgs -join ' ')"
if (-not $DryRun) {
    & mysqldump @MySQLDumpArgs
    if ($LASTEXITCODE -eq 0) {
        Write-Host "MySQL备份完成: $MySQLBackupFile" -ForegroundColor Green
    } else {
        throw "MySQL备份失败"
    }
} else {
    Write-Host "[试运行] 将执行MySQL备份: $MySQLBackupCmd" -ForegroundColor Cyan
}

# 3. Export the core tables (mysql --batch output is tab-separated)
Write-Host "导出核心表数据..." -ForegroundColor Yellow
$CSVExportDir = Join-Path $BackupDir "csv_export"
New-Item -ItemType Directory -Path $CSVExportDir -Force | Out-Null
$CoreTables = @("user", "user_account", "goods", "goods_list", "order", "order_list", "profit_money", "profit_integral", "profit_pay", "coupon", "coupon_receive", "task_list", "user_task_list", "shang", "config")
$failedTables = @()
foreach ($table in $CoreTables) {
    $csvFile = Join-Path $CSVExportDir "$table.csv"
    # The table name is wrapped in MySQL backticks because some names (e.g.
    # "order") are reserved words. The redirect is done by cmd.exe so the
    # bytes written by mysql reach the file unchanged.
    $exportCmd = "mysql -h $SourceServer -u root -p --default-character-set=utf8mb4 --batch --raw -e `"SELECT * FROM ``$table``;`" $SourceDatabase > `"$csvFile`""
    if (-not $DryRun) {
        cmd /c $exportCmd
        if ($LASTEXITCODE -eq 0) {
            # Subtract the header line; an empty table produces no output at all.
            $lineCount = (Get-Content $csvFile | Measure-Object -Line).Lines
            $recordCount = [Math]::Max(0, $lineCount - 1)
            Write-Host "导出 $table : $recordCount 条记录" -ForegroundColor Green
        } else {
            Write-Warning "导出 $table 失败"
            $failedTables += $table
        }
    } else {
        Write-Host "[试运行] 将导出表: $table" -ForegroundColor Cyan
    }
}
# Loading a partial export into production would silently lose data, so stop
# before any change is made on the target.
if ($failedTables.Count -gt 0) {
    throw "导出失败的表: $($failedTables -join ', ')"
}

# 4. Create the SQL Server schema
Write-Host "创建SQL Server表结构..." -ForegroundColor Yellow
$CreateTablesScript = Join-Path $PSScriptRoot "CreateTables.sql"
if (-not $DryRun) {
    # -b makes sqlcmd return a non-zero exit code when a batch fails; without
    # it $LASTEXITCODE stays 0 even if the script raised SQL errors.
    sqlcmd -S $TargetServer -d $TargetDatabase -E -b -i $CreateTablesScript
    if ($LASTEXITCODE -eq 0) {
        Write-Host "表结构创建完成" -ForegroundColor Green
    } else {
        throw "表结构创建失败"
    }
} else {
    Write-Host "[试运行] 将创建表结构: $CreateTablesScript" -ForegroundColor Cyan
}

# 5. Run the data migration
Write-Host "执行数据迁移..." -ForegroundColor Yellow
$MigrateDataScript = Join-Path $PSScriptRoot "MigrateData.sql"
if (-not $DryRun) {
    # Substitute the export directory into a temporary copy of the script
    $scriptContent = Get-Content $MigrateDataScript -Raw
    $scriptContent = $scriptContent.Replace('{{CSV_EXPORT_DIR}}', $CSVExportDir)
    $tempScript = Join-Path $BackupDir "MigrateData_temp.sql"
    $scriptContent | Out-File -FilePath $tempScript -Encoding UTF8
    sqlcmd -S $TargetServer -d $TargetDatabase -E -b -i $tempScript
    if ($LASTEXITCODE -eq 0) {
        Write-Host "数据迁移完成" -ForegroundColor Green
    } else {
        throw "数据迁移失败"
    }
} else {
    Write-Host "[试运行] 将执行数据迁移: $MigrateDataScript" -ForegroundColor Cyan
}

# 6. Validate the migrated data
Write-Host "执行数据验证..." -ForegroundColor Yellow
$ValidateDataScript = Join-Path $PSScriptRoot "ValidateData.sql"
if (-not $DryRun) {
    $validationResult = sqlcmd -S $TargetServer -d $TargetDatabase -E -b -i $ValidateDataScript -h -1
    $validationExitCode = $LASTEXITCODE
    Write-Host "数据验证结果:" -ForegroundColor Yellow
    # Print line by line; passing the array to Write-Host joins it on one line.
    $validationResult | ForEach-Object { Write-Host $_ }
    if ($validationExitCode -ne 0) {
        throw "数据验证失败"
    }
} else {
    Write-Host "[试运行] 将执行数据验证: $ValidateDataScript" -ForegroundColor Cyan
}

$EndTime = Get-Date
$Duration = $EndTime - $StartTime
Write-Host ""
Write-Host "=== 生产环境数据库迁移完成 ===" -ForegroundColor Green
Write-Host "结束时间: $EndTime"
Write-Host "总耗时: $($Duration.ToString('hh\:mm\:ss'))"
Write-Host "备份位置: $BackupDir"
if ($DryRun) {
    Write-Host ""
    Write-Host "这是试运行模式,没有实际执行迁移操作" -ForegroundColor Cyan
    Write-Host "请检查上述计划,确认无误后去掉 -DryRun 参数执行实际迁移" -ForegroundColor Cyan
}
2. 数据验证脚本
-- ValidateData.sql
-- Post-migration validation script for the production database.
-- Prints four report sections: per-table row counts, business totals,
-- integrity checks (duplicates and orphaned rows) and the index inventory
-- of the core tables. Every section only reads data.
PRINT '=== 开始数据验证 ===';
PRINT '验证时间: ' + CONVERT(VARCHAR, GETDATE(), 120);
PRINT '';

-- 1. Row counts, maximum id and created_at range per core table
PRINT '1. 基础记录数验证';
PRINT '表名 记录数 最大ID 最早时间 最晚时间';
PRINT '--------------------------------------------------------------------------------';
SELECT
    CONCAT(
        LEFT(table_name + SPACE(20), 20),
        RIGHT(SPACE(10) + CAST(record_count AS VARCHAR), 10),
        RIGHT(SPACE(10) + CAST(max_id AS VARCHAR), 10),
        ' ',
        ISNULL(CONVERT(VARCHAR, min_time, 120), 'NULL'),
        ' ',
        ISNULL(CONVERT(VARCHAR, max_time, 120), 'NULL')
    ) as validation_result
FROM (
    SELECT 'users' as table_name, COUNT(*) as record_count, MAX(id) as max_id, MIN(created_at) as min_time, MAX(created_at) as max_time FROM users
    UNION ALL
    SELECT 'goods' as table_name, COUNT(*) as record_count, MAX(id) as max_id, MIN(created_at) as min_time, MAX(created_at) as max_time FROM goods
    UNION ALL
    SELECT 'orders' as table_name, COUNT(*) as record_count, MAX(id) as max_id, MIN(created_at) as min_time, MAX(created_at) as max_time FROM orders
    UNION ALL
    SELECT 'order_items' as table_name, COUNT(*) as record_count, MAX(id) as max_id, MIN(created_at) as min_time, MAX(created_at) as max_time FROM order_items
    UNION ALL
    SELECT 'goods_items' as table_name, COUNT(*) as record_count, MAX(id) as max_id, MIN(created_at) as min_time, MAX(created_at) as max_time FROM goods_items
) t
ORDER BY table_name;
PRINT '';

-- 2. Business totals, to be compared with the same figures from the source
PRINT '2. 业务数据一致性验证';
PRINT '指标 数值';
PRINT '------------------------------------------------';
SELECT
    CONCAT(LEFT(metric + SPACE(30), 30), value) as result
FROM (
    SELECT '用户总数' as metric, CAST(COUNT(*) AS VARCHAR) as value FROM users
    UNION ALL
    SELECT '活跃用户数' as metric, CAST(COUNT(*) AS VARCHAR) as value FROM users WHERE status = 1
    UNION ALL
    SELECT '用户总余额' as metric, CAST(SUM(money) AS VARCHAR) as value FROM users
    UNION ALL
    SELECT '用户总积分' as metric, CAST(SUM(integral) AS VARCHAR) as value FROM users
    UNION ALL
    SELECT '商品总数' as metric, CAST(COUNT(*) AS VARCHAR) as value FROM goods
    UNION ALL
    SELECT '上架商品数' as metric, CAST(COUNT(*) AS VARCHAR) as value FROM goods WHERE status = 1
    UNION ALL
    SELECT '订单总数' as metric, CAST(COUNT(*) AS VARCHAR) as value FROM orders
    UNION ALL
    SELECT '已支付订单数' as metric, CAST(COUNT(*) AS VARCHAR) as value FROM orders WHERE status = 1
    UNION ALL
    SELECT '订单总金额' as metric, CAST(SUM(order_total) AS VARCHAR) as value FROM orders WHERE status = 1
) t;
PRINT '';

-- 3. Integrity checks: every value reported here should be 0
PRINT '3. 数据完整性验证';
PRINT '检查项 异常数量';
PRINT '------------------------------------------------';
SELECT
    CONCAT(LEFT(check_name + SPACE(30), 30), error_count) as result
FROM (
    -- Duplicate checks compare COUNT(col) with COUNT(DISTINCT col). Both ignore
    -- NULLs, so rows without a value are not reported as duplicates (which
    -- COUNT(*) - COUNT(DISTINCT col) would do).
    SELECT '用户OpenID重复' as check_name, CAST(COUNT(open_id) - COUNT(DISTINCT open_id) AS VARCHAR) as error_count FROM users
    UNION ALL
    SELECT '用户UID重复' as check_name, CAST(COUNT(uid) - COUNT(DISTINCT uid) AS VARCHAR) as error_count FROM users
    UNION ALL
    SELECT '订单号重复' as check_name, CAST(COUNT(order_no) - COUNT(DISTINCT order_no) AS VARCHAR) as error_count FROM orders
    UNION ALL
    -- Orphan checks: child rows whose parent row does not exist
    SELECT '订单用户关联异常' as check_name, CAST(COUNT(*) AS VARCHAR) as error_count
    FROM orders o LEFT JOIN users u ON o.user_id = u.id WHERE u.id IS NULL
    UNION ALL
    SELECT '订单商品关联异常' as check_name, CAST(COUNT(*) AS VARCHAR) as error_count
    FROM orders o LEFT JOIN goods g ON o.goods_id = g.id WHERE g.id IS NULL
    UNION ALL
    SELECT '商品奖品关联异常' as check_name, CAST(COUNT(*) AS VARCHAR) as error_count
    FROM goods_items gi LEFT JOIN goods g ON gi.goods_id = g.id WHERE g.id IS NULL
) t;
PRINT '';

-- 4. Named indexes on the core tables (heaps have a NULL index name and are skipped)
PRINT '4. 索引和约束验证';
SELECT
    t.name as table_name,
    i.name as index_name,
    i.type_desc as index_type,
    i.is_unique as is_unique
FROM sys.tables t
JOIN sys.indexes i ON t.object_id = i.object_id
WHERE t.name IN ('users', 'goods', 'orders', 'order_items', 'goods_items')
AND i.name IS NOT NULL
ORDER BY t.name, i.name;
PRINT '';
PRINT '=== 数据验证完成 ===';
6.3 应用程序部署 (0.1天)
任务描述
部署.NET 8应用程序到生产环境
具体工作
- 应用程序编译和打包
- 生产环境部署
- 配置文件更新
- 服务启动和验证
部署脚本
1. 应用程序部署脚本
# Deploy-Production.ps1
# Production application deployment script.
#
# Steps: validate the publish output, back up the current version, stop the
# service, replace the files, check the production config, set file
# permissions, start the service and poll the health endpoint.
#
# Parameters:
#   -SourcePath    Directory holding the published application.
#   -TargetPath    Directory the application is deployed to.
#   -ServiceName   Windows service that hosts the application.
#   -BackupPath    Root directory for backups of the current version.
#   -SkipBackup    Skip the backup of the current version.
param(
    [string]$SourcePath = ".\publish",
    [string]$TargetPath = "C:\inetpub\wwwroot\honeybox-api",
    [string]$ServiceName = "HoneyBoxAPI",
    [string]$BackupPath = "D:\Backup\Deployment",
    [switch]$SkipBackup = $false
)

$ErrorActionPreference = "Stop"
$StartTime = Get-Date

Write-Host "=== 生产环境应用程序部署开始 ===" -ForegroundColor Green
Write-Host "开始时间: $StartTime"
Write-Host "源路径: $SourcePath"
Write-Host "目标路径: $TargetPath"
Write-Host ""

# 0. Validate the source before anything is stopped or deleted. Step 3 removes
#    the target directory, so a missing or empty publish folder would otherwise
#    leave production without any files.
if (-not (Test-Path -Path $SourcePath -PathType Container)) {
    throw "源路径不存在: $SourcePath"
}
if (-not (Get-ChildItem -Path $SourcePath -Force | Select-Object -First 1)) {
    throw "源路径为空: $SourcePath"
}

# 1. Back up the current version
if (-not $SkipBackup -and (Test-Path $TargetPath)) {
    Write-Host "创建当前版本备份..." -ForegroundColor Yellow
    $BackupDir = Join-Path $BackupPath (Get-Date -Format "yyyyMMdd_HHmmss")
    New-Item -ItemType Directory -Path $BackupDir -Force | Out-Null
    Copy-Item -Path $TargetPath -Destination $BackupDir -Recurse -Force
    Write-Host "备份完成: $BackupDir" -ForegroundColor Green
}

# 2. Stop the application service (best effort: errors are suppressed so a
#    first-time deployment without an installed service can continue)
Write-Host "停止应用程序服务..." -ForegroundColor Yellow
try {
    Stop-Service -Name $ServiceName -Force -ErrorAction SilentlyContinue
    Start-Sleep -Seconds 5
    Write-Host "服务已停止" -ForegroundColor Green
} catch {
    Write-Warning "停止服务时出现警告: $($_.Exception.Message)"
}

# 3. Deploy the new version
Write-Host "部署新版本..." -ForegroundColor Yellow
if (Test-Path $TargetPath) {
    Remove-Item -Path $TargetPath -Recurse -Force
}
New-Item -ItemType Directory -Path $TargetPath -Force | Out-Null
Copy-Item -Path "$SourcePath\*" -Destination $TargetPath -Recurse -Force
Write-Host "文件部署完成" -ForegroundColor Green

# 4. Check that the production configuration file was deployed
Write-Host "更新配置文件..." -ForegroundColor Yellow
$configFile = Join-Path $TargetPath "appsettings.Production.json"
if (Test-Path $configFile) {
    Write-Host "生产配置文件已存在: $configFile" -ForegroundColor Green
} else {
    Write-Warning "生产配置文件不存在,请检查配置"
}

# 5. Set file permissions
# NOTE(review): IIS_IUSRS receives FullControl on the whole deployment
# directory; confirm whether a narrower right is sufficient.
Write-Host "设置文件权限..." -ForegroundColor Yellow
$acl = Get-Acl $TargetPath
$accessRule = New-Object System.Security.AccessControl.FileSystemAccessRule("IIS_IUSRS", "FullControl", "ContainerInherit,ObjectInherit", "None", "Allow")
$acl.SetAccessRule($accessRule)
Set-Acl -Path $TargetPath -AclObject $acl
Write-Host "权限设置完成" -ForegroundColor Green

# 6. Start the application service
Write-Host "启动应用程序服务..." -ForegroundColor Yellow
try {
    Start-Service -Name $ServiceName
    Start-Sleep -Seconds 10
    $service = Get-Service -Name $ServiceName
    if ($service.Status -eq "Running") {
        Write-Host "服务启动成功" -ForegroundColor Green
    } else {
        throw "服务启动失败,状态: $($service.Status)"
    }
} catch {
    # With $ErrorActionPreference = "Stop" this Write-Error ends the script.
    Write-Error "启动服务失败: $($_.Exception.Message)"
}

# 7. Health check
Write-Host "执行健康检查..." -ForegroundColor Yellow
$healthCheckUrl = "https://api.honeybox.com/health"
$maxRetries = 5
$healthy = $false
# Every unsuccessful attempt counts, whether the request threw or returned a
# status other than 200, so the loop always ends after $maxRetries attempts.
for ($attempt = 1; $attempt -le $maxRetries; $attempt++) {
    try {
        # -UseBasicParsing avoids the Internet Explorer dependency of Windows PowerShell
        $response = Invoke-WebRequest -Uri $healthCheckUrl -TimeoutSec 30 -UseBasicParsing
        if ($response.StatusCode -eq 200) {
            $healthy = $true
            break
        }
        Write-Warning "健康检查返回状态码: $($response.StatusCode)"
    } catch {
        Write-Warning "健康检查请求失败: $($_.Exception.Message)"
    }
    if ($attempt -lt $maxRetries) {
        Write-Host "健康检查失败,等待重试... ($attempt/$maxRetries)" -ForegroundColor Yellow
        Start-Sleep -Seconds 10
    }
}
if ($healthy) {
    Write-Host "健康检查通过" -ForegroundColor Green
} else {
    Write-Error "健康检查失败,已达到最大重试次数"
}

$EndTime = Get-Date
$Duration = $EndTime - $StartTime
Write-Host ""
Write-Host "=== 生产环境应用程序部署完成 ===" -ForegroundColor Green
Write-Host "结束时间: $EndTime"
Write-Host "总耗时: $($Duration.ToString('mm\:ss'))"
6.4 监控和告警配置 (0.1天)
任务描述
配置系统监控和告警机制
具体工作
- 数据库性能监控
- 应用程序监控
- 健康检查配置
- 告警规则设置
监控配置
1. 数据库监控脚本
-- DatabaseMonitoring.sql
-- Read-only monitoring queries. Each statement returns one result set whose
-- first column, monitor_type, identifies the section.

-- 1. Business activity snapshot of the current database
SELECT
    monitor_type         = 'Database Info',
    database_name        = DB_NAME(),
    check_time           = GETDATE(),
    active_users         = (SELECT COUNT(*) FROM users WHERE status = 1),
    active_goods         = (SELECT COUNT(*) FROM goods WHERE status = 1),
    orders_last_24h      = (SELECT COUNT(*) FROM orders WHERE created_at >= DATEADD(DAY, -1, GETDATE())),
    paid_orders_last_24h = (SELECT COUNT(*) FROM orders WHERE status = 1 AND created_at >= DATEADD(DAY, -1, GETDATE()));

-- 2. Raw performance counter values
-- NOTE(review): the "/sec" counters in this view are presumably cumulative
-- totals rather than per-second rates; confirm before alerting on them.
SELECT
    monitor_type            = 'Performance',
    check_time              = GETDATE(),
    batch_requests_per_sec  = (SELECT cntr_value FROM sys.dm_os_performance_counters WHERE counter_name = 'Batch Requests/sec'),
    compilations_per_sec    = (SELECT cntr_value FROM sys.dm_os_performance_counters WHERE counter_name = 'SQL Compilations/sec'),
    recompilations_per_sec  = (SELECT cntr_value FROM sys.dm_os_performance_counters WHERE counter_name = 'SQL Re-Compilations/sec'),
    user_connections        = (SELECT cntr_value FROM sys.dm_os_performance_counters WHERE counter_name = 'User Connections');

-- 3. Top ten wait types by total wait time, excluding the listed wait types
SELECT TOP (10)
    monitor_type = 'Wait Stats',
    ws.wait_type,
    ws.waiting_tasks_count,
    ws.wait_time_ms,
    ws.max_wait_time_ms,
    ws.signal_wait_time_ms,
    check_time = GETDATE()
FROM sys.dm_os_wait_stats AS ws
WHERE ws.wait_type NOT IN (
    'CLR_SEMAPHORE', 'LAZYWRITER_SLEEP', 'RESOURCE_QUEUE', 'SLEEP_TASK',
    'SLEEP_SYSTEMTASK', 'SQLTRACE_BUFFER_FLUSH', 'WAITFOR', 'LOGMGR_QUEUE',
    'CHECKPOINT_QUEUE', 'REQUEST_FOR_DEADLOCK_SEARCH', 'XE_TIMER_EVENT',
    'BROKER_TO_FLUSH', 'BROKER_TASK_STOP', 'CLR_MANUAL_EVENT', 'CLR_AUTO_EVENT'
)
ORDER BY ws.wait_time_ms DESC;

-- 4. Requests that are currently blocked by another session
SELECT
    monitor_type = 'Blocking',
    req.blocking_session_id,
    req.session_id,
    req.wait_type,
    req.wait_time,
    req.wait_resource,
    check_time = GETDATE()
FROM sys.dm_exec_requests AS req
WHERE req.blocking_session_id != 0;

-- 5. Ten most-read indexes on the core tables of the current database
SELECT TOP (10)
    monitor_type = 'Index Usage',
    table_name   = OBJECT_NAME(usage.object_id),
    index_name   = idx.name,
    usage.user_seeks,
    usage.user_scans,
    usage.user_lookups,
    usage.user_updates,
    check_time   = GETDATE()
FROM sys.dm_db_index_usage_stats AS usage
INNER JOIN sys.indexes AS idx
    ON usage.object_id = idx.object_id
   AND usage.index_id = idx.index_id
WHERE usage.database_id = DB_ID()
  AND OBJECT_NAME(usage.object_id) IN ('users', 'goods', 'orders', 'order_items', 'goods_items')
ORDER BY usage.user_seeks + usage.user_scans + usage.user_lookups DESC;
2. 应用程序健康检查
// HealthChecks/DatabaseHealthCheck.cs
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Microsoft.EntityFrameworkCore;
using HoneyBox.Data;
namespace HoneyBox.Api.HealthChecks
{
/// <summary>
/// Health check that verifies the database is reachable and that the core
/// tables can be queried.
/// </summary>
public class DatabaseHealthCheck : IHealthCheck
{
    private readonly HoneyBoxDbContext _context;
    private readonly ILogger<DatabaseHealthCheck> _logger;

    public DatabaseHealthCheck(HoneyBoxDbContext context, ILogger<DatabaseHealthCheck> logger)
    {
        _context = context;
        _logger = logger;
    }

    /// <summary>
    /// Connects to the database, counts users and goods, and counts the
    /// orders created within the last hour.
    /// </summary>
    /// <param name="context">Health check context supplied by the framework (not used).</param>
    /// <param name="cancellationToken">Token forwarded to every database call.</param>
    /// <returns>
    /// Healthy with the collected counts as data; Unhealthy when the database
    /// cannot be reached or any query throws.
    /// </returns>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            // CanConnectAsync reports an unreachable database through its
            // boolean result, so the result has to be inspected explicitly.
            var canConnect = await _context.Database.CanConnectAsync(cancellationToken);
            if (!canConnect)
            {
                _logger.LogError("数据库健康检查失败");
                return HealthCheckResult.Unhealthy("数据库连接失败");
            }

            // Check that the key tables are accessible
            var userCount = await _context.Users.CountAsync(cancellationToken);
            var goodsCount = await _context.Goods.CountAsync(cancellationToken);

            // Check recent order activity
            var recentOrders = await _context.Orders
                .Where(o => o.CreatedAt >= DateTime.Now.AddHours(-1))
                .CountAsync(cancellationToken);

            var data = new Dictionary<string, object>
            {
                ["user_count"] = userCount,
                ["goods_count"] = goodsCount,
                ["recent_orders"] = recentOrders,
                ["check_time"] = DateTime.Now
            };
            return HealthCheckResult.Healthy("数据库连接正常", data);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "数据库健康检查失败");
            return HealthCheckResult.Unhealthy("数据库连接失败", ex);
        }
    }
}
/// <summary>
/// Health check that round-trips a temporary value through the cache service.
/// </summary>
public class CacheHealthCheck : IHealthCheck
{
    private readonly ICacheService _cacheService;
    private readonly ILogger<CacheHealthCheck> _logger;

    public CacheHealthCheck(ICacheService cacheService, ILogger<CacheHealthCheck> logger)
    {
        _cacheService = cacheService;
        _logger = logger;
    }

    /// <summary>
    /// Writes a uniquely named probe entry, reads it back, removes it and
    /// compares the value read with the value written.
    /// </summary>
    /// <returns>
    /// Healthy when the values match, Degraded when they differ, Unhealthy
    /// when any cache call throws.
    /// </returns>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var probeKey = $"health_check_{Guid.NewGuid()}";
            var expected = DateTime.Now.ToString();

            // Write, read back, then clean up the probe entry
            await _cacheService.SetAsync(probeKey, expected, TimeSpan.FromMinutes(1));
            var actual = await _cacheService.GetAsync<string>(probeKey);
            await _cacheService.RemoveAsync(probeKey);

            return actual == expected
                ? HealthCheckResult.Healthy("缓存服务正常")
                : HealthCheckResult.Degraded("缓存读写不一致");
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "缓存健康检查失败");
            return HealthCheckResult.Unhealthy("缓存服务异常", ex);
        }
    }
}
}
// Health check registration in Program.cs.
// Every check carries the "ready" tag because the /health/ready endpoint below
// selects checks by that tag; without the tag the readiness endpoint would run
// no checks at all and always report Healthy.
builder.Services.AddHealthChecks()
    .AddCheck<DatabaseHealthCheck>("database", tags: new[] { "ready" })
    .AddCheck<CacheHealthCheck>("cache", tags: new[] { "ready" })
    .AddDbContextCheck<HoneyBoxDbContext>("ef_context", tags: new[] { "ready" });

// Health check endpoints.
// /health runs every registered check and writes a detailed JSON report.
// NOTE(review): the report includes exception messages and check data; confirm
// this endpoint is not reachable by untrusted clients.
app.MapHealthChecks("/health", new HealthCheckOptions
{
    ResponseWriter = async (context, report) =>
    {
        context.Response.ContentType = "application/json";
        var response = new
        {
            status = report.Status.ToString(),
            checks = report.Entries.Select(x => new
            {
                name = x.Key,
                status = x.Value.Status.ToString(),
                exception = x.Value.Exception?.Message,
                duration = x.Value.Duration.ToString(),
                data = x.Value.Data
            }),
            totalDuration = report.TotalDuration.ToString()
        };
        await context.Response.WriteAsync(JsonSerializer.Serialize(response));
    }
});

// /health/ready runs only the checks tagged "ready".
app.MapHealthChecks("/health/ready", new HealthCheckOptions
{
    Predicate = check => check.Tags.Contains("ready")
});

// /health/live excludes every check, so it only shows that the process can
// answer HTTP requests.
app.MapHealthChecks("/health/live", new HealthCheckOptions
{
    Predicate = _ => false
});
3. 告警配置脚本
# AlertingSetup.ps1
# Alert rule definitions.
# The script only declares three rule lists (database, application, business)
# and prints how many rules each list holds; nothing in this file evaluates
# the rules or sends notifications.

# 1. Database performance alerts
$DatabaseAlerts = @(
    @{
        Name = "数据库CPU使用率过高"
        # NOTE(review): 'Processor Time %' looks like a Windows counter name;
        # verify that sys.dm_os_performance_counters exposes it on the target
        # instance, otherwise AVG returns NULL and this rule can never fire.
        Query = "SELECT AVG(cntr_value) FROM sys.dm_os_performance_counters WHERE counter_name = 'Processor Time %'"
        Threshold = 80
        Severity = "Warning"
    },
    @{
        Name = "数据库连接数过多"
        Query = "SELECT cntr_value FROM sys.dm_os_performance_counters WHERE counter_name = 'User Connections'"
        Threshold = 100
        Severity = "Critical"
    },
    @{
        Name = "阻塞查询检测"
        Query = "SELECT COUNT(*) FROM sys.dm_exec_requests WHERE blocking_session_id <> 0"
        Threshold = 0
        Severity = "Warning"
    },
    @{
        Name = "死锁检测"
        # Restricted to the _Total instance so the query yields a single value.
        # NOTE(review): this counter presumably has one row per lock resource
        # type plus _Total - confirm on the target instance.
        Query = "SELECT cntr_value FROM sys.dm_os_performance_counters WHERE counter_name = 'Number of Deadlocks/sec' AND instance_name = '_Total'"
        Threshold = 0
        Severity = "Critical"
    }
)

# 2. Application alerts
$ApplicationAlerts = @(
    @{
        Name = "API响应时间过长"
        Endpoint = "https://api.honeybox.com/health"
        MaxResponseTime = 5000
        Severity = "Warning"
    },
    @{
        Name = "API不可用"
        Endpoint = "https://api.honeybox.com/health"
        ExpectedStatusCode = 200
        Severity = "Critical"
    },
    @{
        Name = "内存使用率过高"
        ProcessName = "HoneyBoxAPI"
        MaxMemoryMB = 2048
        Severity = "Warning"
    }
)

# 3. Business metric alerts
$BusinessAlerts = @(
    @{
        Name = "订单量异常下降"
        Query = "SELECT COUNT(*) FROM orders WHERE created_at >= DATEADD(HOUR, -1, GETDATE())"
        MinThreshold = 5
        Severity = "Warning"
    },
    @{
        Name = "用户登录异常"
        Query = "SELECT COUNT(*) FROM user_login_logs WHERE login_time >= DATEADD(HOUR, -1, GETDATE())"
        MinThreshold = 10
        Severity = "Warning"
    },
    @{
        Name = "抽奖失败率过高"
        # NULLIF turns "no orders in the last hour" into NULL instead of a
        # divide-by-zero error.
        Query = "SELECT CAST(SUM(CASE WHEN status = 0 THEN 1 ELSE 0 END) AS FLOAT) / NULLIF(COUNT(*), 0) * 100 FROM orders WHERE created_at >= DATEADD(HOUR, -1, GETDATE())"
        MaxThreshold = 5.0
        Severity = "Critical"
    }
)

Write-Host "告警规则配置完成" -ForegroundColor Green
Write-Host "数据库告警: $($DatabaseAlerts.Count) 条"
Write-Host "应用程序告警: $($ApplicationAlerts.Count) 条"
Write-Host "业务指标告警: $($BusinessAlerts.Count) 条"
验收标准
生产环境验收
- SQL Server生产环境配置完成
- 生产数据库创建和权限配置正确
- 应用程序配置文件更新完成
- SSL证书和安全配置完成
数据迁移验收
- 生产数据备份完成
- 表结构创建成功
- 数据迁移执行成功
- 数据验证通过,记录数和业务数据一致
应用部署验收
- 应用程序部署成功
- 服务启动正常
- 健康检查通过
- API接口响应正常
监控告警验收
- 数据库监控配置完成
- 应用程序健康检查配置完成
- 告警规则设置完成
- 监控数据正常收集
风险点和注意事项
部署风险
- 数据丢失: 迁移过程中可能出现数据丢失
- 服务中断: 部署过程中服务不可用
- 配置错误: 生产环境配置可能有误
- 性能问题: 生产环境性能可能不如预期
监控风险
- 告警风暴: 告警规则设置不当可能产生大量告警
- 监控盲点: 某些关键指标可能未被监控
- 误报: 告警阈值设置不合理导致误报
- 延迟: 监控数据收集和告警可能有延迟
解决方案
- 充分备份: 在迁移前创建完整备份
- 分步部署: 采用蓝绿部署或滚动部署
- 回滚准备: 准备快速回滚方案
- 渐进式监控: 逐步完善监控和告警规则
运维文档
日常运维检查清单
- 检查数据库连接状态
- 检查应用程序服务状态
- 检查磁盘空间使用情况
- 检查内存使用情况
- 检查错误日志
- 检查性能指标
- 检查备份状态
故障处理流程
- 故障发现: 通过监控告警或用户反馈
- 故障确认: 确认故障范围和影响
- 紧急处理: 采取临时措施恢复服务
- 根因分析: 分析故障原因
- 永久修复: 实施永久解决方案
- 总结改进: 总结经验,改进流程
联系信息
- 开发团队: dev-team@company.com
- 运维团队: ops-team@company.com
- DBA: dba@company.com
- 紧急联系: emergency@company.com
阶段6完成标志: 生产环境部署成功,数据迁移完成,监控告警配置完成,系统稳定运行,数据库迁移项目圆满完成。