4

在 Azure 中启用 VM 诊断非常痛苦。我已经使用 ARM 模板、Azure PowerShell SDK 和 Azure CLI 让它工作了。但是我这几天一直在尝试使用 Terraform 和 azurerm_virtual_machine_extension 资源为 Windows 和 Linux VM 启用 VM 诊断。还是不行,唉!

这是我到目前为止所拥有的(我已经对此进行了一些调整以简化这篇文章,所以希望我的手动编辑没有破坏任何东西):

  # Linux diagnostics extension (publisher Microsoft.Azure.Diagnostics, type LinuxDiagnostic).
  # Created only when local.is_windows_vm equals the string "false" (count is 1, otherwise 0).
  resource "azurerm_virtual_machine_extension" "vm-linux" {
  count                      = "${local.is_windows_vm == "false" ? 1 : 0}"
  # Explicit ordering: the extension is applied after the data disk attachment resource.
  depends_on                 = ["azurerm_virtual_machine_data_disk_attachment.vm"]
  name                       = "LinuxDiagnostic"
  location                   = "${var.location}"
  resource_group_name        = "${var.resource_group_name}"
  virtual_machine_name       = "${local.vm_name}"
  publisher                  = "Microsoft.Azure.Diagnostics"
  type                       = "LinuxDiagnostic"
  type_handler_version       = "3.0"
  auto_upgrade_minor_version = "true"

  # The JSON file referenced below was created by running "az vm diagnostics get-default-config", and adding/verifying the "__DIAGNOSTIC_STORAGE_ACCOUNT__" and "__VM_RESOURCE_ID__" placeholders.
  # "ladCfg" is the file content with both placeholders substituted and then base64-encoded.
  # NOTE(review): LAD 3.0 reportedly expects "ladCfg" as a plain JSON object and the key
  # "StorageAccount" (capital S) rather than a base64 string - confirm against the extension docs.
  settings = <<SETTINGS
    {
      "ladCfg": "${base64encode(replace(replace(file("${path.module}/.diag-settings/linux_diag_config.json"), "__DIAGNOSTIC_STORAGE_ACCOUNT__", "${module.vm_storage_account.name}"), "__VM_RESOURCE_ID__", "${local.metricsresourceid}"))}",
      "storageAccount": "${module.vm_storage_account.name}"
    }
SETTINGS

  # SAS token below: Do not include the leading question mark, as per https://docs.microsoft.com/en-us/azure/virtual-machines/extensions/diagnostics-linux.
  # The regex replace removes a single "?" at the start of the generated SAS string.
  protected_settings = <<SETTINGS
    {
      "storageAccountName": "${module.vm_storage_account.name}",
      "storageAccountSasToken": "${replace(data.azurerm_storage_account_sas.current.sas, "/^\\?/", "")}",
      "storageAccountEndPoint": "https://core.windows.net/"
    }
SETTINGS
}

# Windows diagnostics extension (publisher Microsoft.Azure.Diagnostics, type IaaSDiagnostics).
# Created only when local.is_windows_vm equals the string "true" (count is 1, otherwise 0).
resource "azurerm_virtual_machine_extension" "vm-win" {
  count                      = "${local.is_windows_vm == "true" ? 1 : 0}"
  # Explicit ordering: the extension is applied after the data disk attachment resource.
  depends_on                 = ["azurerm_virtual_machine_data_disk_attachment.vm"]
  name                       = "Microsoft.Insights.VMDiagnosticsSettings"
  location                   = "${var.location}"
  resource_group_name        = "${var.resource_group_name}"
  virtual_machine_name       = "${local.vm_name}"
  publisher                  = "Microsoft.Azure.Diagnostics"
  type                       = "IaaSDiagnostics"
  type_handler_version       = "1.9"
  auto_upgrade_minor_version = "true"

  # The JSON file referenced below was created by running "az vm diagnostics get-default-config --is-windows-os", and adding/verifying the "__DIAGNOSTIC_STORAGE_ACCOUNT__" and "__VM_RESOURCE_ID__" placeholders.
  # "wadCfg" is the file content with both placeholders substituted and then base64-encoded.
  # NOTE(review): the extension may expect base64 only for the XML form ("xmlCfg") and a
  # plain JSON object for "wadCfg" - confirm against the Windows diagnostics extension docs.
  settings = <<SETTINGS
    {
      "wadCfg": "${base64encode(replace(replace(file("${path.module}/.diag-settings/windows_diag_config.json"), "__DIAGNOSTIC_STORAGE_ACCOUNT__", "${module.vm_storage_account.name}"), "__VM_RESOURCE_ID__", "${local.metricsresourceid}"))}",
      "storageAccount": "${module.vm_storage_account.name}"
    }
SETTINGS

  # Unlike the Linux resource, the SAS string is passed through unmodified here
  # (no stripping of a leading "?").
  protected_settings = <<SETTINGS
    {
      "storageAccountName": "${module.vm_storage_account.name}",
      "storageAccountSasToken": "${data.azurerm_storage_account_sas.current.sas}",
      "storageAccountEndPoint": "https://core.windows.net/"
    }
SETTINGS
}

请注意,对于 Linux 和 Windows,我根据注释从代码库中的 JSON 文件加载诊断详细信息。这些是 Azure 提供的默认配置,因此它们应该是有效的。

当我部署这些时,Linux VM 扩展部署成功,但在 Azure 门户中,扩展显示“在生成的 mdsd 配置中检测到问题”。如果我查看 VM 的“诊断设置”,它会显示“遇到错误:TypeError:对象不支持属性或方法 'diagnosticMonitorConfiguration'”。Windows VM 扩展无法完全部署,说它“无法读取配置”。如果我在门户中查看扩展程序,则会显示以下错误:

"code": "ComponentStatus//failed/-3",
"level": "Error",
"displayStatus": "Provisioning failed",
"message": "Error starting the diagnostics extension"

如果我查看“诊断设置”窗格,它只会挂着一个永无止境的“......” 动画。

但是,如果我查看两个 VM 扩展的“terraform apply”输出,解码后的设置看起来完全符合预期,将配置文件与正确替换的占位符匹配。

关于如何使它工作的任何建议?

提前致谢!

4

3 回答 3

2

到目前为止,我已经让 Windows 诊断程序在我们的环境中 100% 正常工作。AzureRM API 似乎对发送的配置非常挑剔。我们之前一直使用 PowerShell 来启用它,但在 PowerShell 中可用的同一份 xmlCfg 在 Terraform 中却不起作用。以下是目前对我们有效的配置(注意:settings/protected_settings 中的键名区分大小写!也就是说,xmlCfg 有效,而 xmlcfg 无效):

主文件

#########################################################
#  VM Extensions - Windows In-Guest Monitoring/Diagnostics
#########################################################
# Windows in-guest diagnostics extension. Name, publisher, type and version are all
# read from the "InGuestDiagnostics" entry of var.compute.
resource "azurerm_virtual_machine_extension" "InGuestDiagnostics" {
  name                       = var.compute["InGuestDiagnostics"]["name"]
  location                   = azurerm_resource_group.VMResourceGroup.location
  resource_group_name        = azurerm_resource_group.VMResourceGroup.name
  virtual_machine_name       = azurerm_virtual_machine.Compute.name
  publisher                  = var.compute["InGuestDiagnostics"]["publisher"]
  type                       = var.compute["InGuestDiagnostics"]["type"]
  type_handler_version       = var.compute["InGuestDiagnostics"]["type_handler_version"]
  auto_upgrade_minor_version = var.compute["InGuestDiagnostics"]["auto_upgrade_minor_version"]

  # "xmlCfg" is the wadcfgxml.tmpl template rendered with the VM id as "vmid" and then
  # base64-encoded. The key names inside the JSON are case sensitive per the author's note.
  settings           = <<SETTINGS
    {
      "xmlCfg": "${base64encode(templatefile("${path.module}/templates/wadcfgxml.tmpl", { vmid = azurerm_virtual_machine.Compute.id }))}",
      "storageAccount": "${data.azurerm_storage_account.InGuestDiagStorageAccount.name}"
    }
SETTINGS
  # Authenticates with the storage account's primary access key (not a SAS token).
  protected_settings = <<PROTECTEDSETTINGS
    {
      "storageAccountName": "${data.azurerm_storage_account.InGuestDiagStorageAccount.name}",
      "storageAccountKey": "${data.azurerm_storage_account.InGuestDiagStorageAccount.primary_access_key}",
      "storageAccountEndPoint": "https://core.windows.net"
    }
PROTECTEDSETTINGS
}

变量

  # Identity and version of the Windows diagnostics extension to install
  InGuestDiagnostics = {
    publisher                  = "Microsoft.Azure.Diagnostics"
    type                       = "IaaSDiagnostics"
    type_handler_version       = "1.16"
    auto_upgrade_minor_version = "true"
    name                       = "WindowsDiagnostics"
  }

wadcfgxml.tmpl(为简洁起见,我删除了一些 Perf 计数器)

<WadCfg>
    <!-- Diagnostics configuration rendered through Terraform templatefile; the Metrics resourceId placeholder is filled with the VM id at render time. -->
    <DiagnosticMonitorConfiguration overallQuotaInMB="5120">
        <DiagnosticInfrastructureLogs scheduledTransferLogLevelFilter="Error"/>
        <Metrics resourceId="${vmid}">
            <MetricAggregation scheduledTransferPeriod="PT1H"/>
            <MetricAggregation scheduledTransferPeriod="PT1M"/>
        </Metrics>
        <PerformanceCounters scheduledTransferPeriod="PT1M">
            <PerformanceCounterConfiguration counterSpecifier="\Processor Information(_Total)\% Processor Time" sampleRate="PT60S" unit="Percent" />
            <PerformanceCounterConfiguration counterSpecifier="\Processor Information(_Total)\% Privileged Time" sampleRate="PT60S" unit="Percent" />
            <PerformanceCounterConfiguration counterSpecifier="\Processor Information(_Total)\% User Time" sampleRate="PT60S" unit="Percent" />
            <PerformanceCounterConfiguration counterSpecifier="\Processor Information(_Total)\Processor Frequency" sampleRate="PT60S" unit="Count" />
            <PerformanceCounterConfiguration counterSpecifier="\System\Processes" sampleRate="PT60S" unit="Count" />
            <PerformanceCounterConfiguration counterSpecifier="\SQLServer:SQL Statistics\SQL Re-Compilations/sec" sampleRate="PT60S" unit="Count" />
        </PerformanceCounters>

        <!-- Each DataSource is "channel!XPath"; all three channels use the same Level 1 or Level 2 filter. -->
        <WindowsEventLog scheduledTransferPeriod="PT1M">
            <DataSource name="Application!*[System[(Level = 1 or Level = 2)]]"/>
            <DataSource name="Security!*[System[(Level = 1 or Level = 2)]]"/>
            <DataSource name="System!*[System[(Level = 1 or Level = 2)]]"/>
        </WindowsEventLog>
    </DiagnosticMonitorConfiguration>
</WadCfg>

我终于让 Linux In-Guest Diagnostics (LAD) 也工作了。有几点值得注意:与 Windows 诊断不同,LAD 的设置需要以 JSON 格式直接传入,不能进行 base64 编码。此外,LAD 似乎要求使用存储帐户的 SAS 令牌。之前提到的注意事项依然适用:AzureRM API 对配置非常挑剔,并且设置中的键名区分大小写。以下是目前对我有效的配置:

# Local values shared by the In-Guest Diagnostics resources
locals {
  # Settings map for the currently selected Terraform workspace.
  env = var.workspace[terraform.workspace]

  # The SAS validity window is pinned to fixed timestamps. A moving start time
  # would make Terraform recreate the token on every apply, which in turn would
  # modify/recreate everything that uses it. Not ideal, but 873600h is 36,400
  # days (roughly 100 years), so the token stays valid for a very long time.
  sas_begintime = "2019-11-22T00:00:00Z"
  sas_endtime   = timeadd(local.sas_begintime, "873600h")
}

#########################################################
#  VM Extensions - In-Guest Diagnostics
#########################################################
# SAS token required by the In-Guest Metrics extension
data "azurerm_storage_account_sas" "inguestdiagnostics" {
  # Only evaluated when the workspace settings contain an "InGuestDiagnostics" key.
  count = contains(keys(local.env), "InGuestDiagnostics") ? 1 : 0

  connection_string = data.azurerm_storage_account.BootDiagStorageAccount.primary_connection_string
  https_only        = true

  # Validity window taken from the fixed timestamps defined in locals.
  start  = local.sas_begintime
  expiry = local.sas_endtime

  # Every storage service is enabled.
  services {
    blob  = true
    file  = true
    queue = true
    table = true
  }

  # Every resource level is enabled.
  resource_types {
    service   = true
    container = true
    object    = true
  }

  # Every permission is granted.
  permissions {
    read    = true
    list    = true
    add     = true
    create  = true
    write   = true
    update  = true
    delete  = true
    process = true
  }
}

# Installs the diagnostics extension once per entry of local.env["InGuestDiagnostics"].
# When that key is absent, for_each receives an empty map and nothing is created.
resource "azurerm_virtual_machine_extension" "inguestdiagnostics" {
  for_each = contains(keys(local.env), "InGuestDiagnostics") ? local.env["InGuestDiagnostics"] : {}
  # Explicit ordering: applied after the dependencyagent extension.
  depends_on = [azurerm_virtual_machine_extension.dependencyagent]

  # each.key selects the matching VM in azurerm_virtual_machine.compute;
  # each.value carries the extension's name, publisher, type and version.
  name                       = each.value["name"]
  location                   = azurerm_resource_group.resourcegroup.location
  resource_group_name        = azurerm_resource_group.resourcegroup.name
  virtual_machine_name       = azurerm_virtual_machine.compute["${each.key}"].name
  publisher                  = each.value["publisher"]
  type                       = each.value["type"]
  type_handler_version       = each.value["type_handler_version"]
  auto_upgrade_minor_version = each.value["auto_upgrade_minor_version"]

  # Public settings are rendered from ladcfg2json.tmpl with the VM id and the storage
  # account name; the result is passed as-is (no base64 encoding).
  settings           = templatefile("${path.module}/templates/ladcfg2json.tmpl", { vmid = azurerm_virtual_machine.compute["${each.key}"].id, storageAccountName = data.azurerm_storage_account.BootDiagStorageAccount.name })
  # The SAS data source uses count, hence the ".0" index. The regex replace strips a
  # single leading "?" from the token.
  protected_settings = <<PROTECTEDSETTINGS
     {
       "storageAccountName": "${data.azurerm_storage_account.BootDiagStorageAccount.name}",
       "storageAccountSasToken": "${replace(data.azurerm_storage_account_sas.inguestdiagnostics.0.sas, "/^\\?/", "")}"
     }
 PROTECTEDSETTINGS
}
# These variations didn't work for me ..
# "ladCfg": "${templatefile("${path.module}/templates/ladcfgjson.tmpl", { vmid = azurerm_virtual_machine.compute["${each.key}"].id, storageAccountName = data.azurerm_storage_account.BootDiagStorageAccount.name })}",
# - This one gets you Error: "settings" contains an invalid JSON: invalid character '\n' in string literal or Error: "settings" contains an invalid JSON: invalid character 'S' after object key:value pair

# "ladCfg": "${replace(data.local_file.ladcfgjson["${each.key}"].content, "/\\n/", "")}",
# - This one gets you Error: "settings" contains an invalid JSON: invalid character 'S' after object key:value pair

变量

workspace = {
  TerraformWorkSpaceName = {
    # One key/object per Compute that should get the InGuestDiagnostics extension
    InGuestDiagnostics = {
      computer01 = {
        publisher                  = "Microsoft.Azure.Diagnostics"
        type                       = "LinuxDiagnostic"
        type_handler_version       = "3.0"
        auto_upgrade_minor_version = "true"
        name                       = "LinuxDiagnostic"
      }
    }

    # One key/object per Compute
    compute = {
      computer01 = {
        name = "computer01"
      }
    }
  }
}

如果不把整个内容包装在 jsonencode 中,我就无法让模板文件正常工作。下面是 ladcfg2json.tmpl 的内容:

${jsonencode({
  "StorageAccount": storageAccountName,
  "ladCfg": {
    "sampleRateInSeconds": 15,
    "diagnosticMonitorConfiguration": {
      "metrics": {
        "metricAggregation": [
          { "scheduledTransferPeriod": "PT1M" },
          { "scheduledTransferPeriod": "PT1H" }
        ],
        "resourceId": vmid
      },
      "eventVolume": "Medium",
      "performanceCounters": {
        "sinks": "",
        "performanceCounterConfiguration": [
          {
            "counterSpecifier": "/builtin/processor/percentiowaittime",
            "condition": "IsAggregate=TRUE",
            "sampleRate": "PT15S",
            "annotation": [
              { "locale": "en-us", "displayName": "CPU IO wait time" }
            ],
            "unit": "Percent",
            "class": "processor",
            "counter": "percentiowaittime",
            "type": "builtin"
          }
        ]
      },
      "syslogEvents": {
        "syslogEventConfiguration": { "LOG_LOCAL0": "LOG_DEBUG" }
      }
    }
  }
})}

我希望这有帮助..

于 2019-12-19T17:16:51.653 回答
1

由于这个问题是一年多前提出的,这个回答更多是写给像我这样第一次尝试的人。我们只使用 Linux VM,所以以下建议仅适用于 Linux:

  1. 受保护的设置应该使用 PROTECTED_SETTINGS 而不是 SETTINGS(您可以在上面的@rv23 答案中看到)
  2. 根据我参考的文档 https://docs.microsoft.com/en-gb/azure/virtual-machines/extensions/diagnostics-linux#protected-settings ,您需要指定 storageAccountSasToken 而不是 storageAccountKey。

这是我修改后的配置版本(请将所有大写的占位符替换为您自己的设置):

    # Illustrative Linux diagnostics extension; every upper-case value is a placeholder
    # to be replaced with real settings.
    resource "azurerm_virtual_machine_extension" "vm_linux_diagnostics" {
    count = "1"

    name = "NAME"

        resource_group_name = "YOUR RESOURCE GROUP NAME"
        location            = "YOUR LOCATION"

        virtual_machine_name = "TARGET MACHINE NAME"

        publisher                  = "Microsoft.Azure.Diagnostics"
        type                       = "LinuxDiagnostic"
        type_handler_version       = "3.0"
        auto_upgrade_minor_version = "true"

        # Public settings are passed as plain JSON (no base64). The payload below is
        # abridged: the "MORE METRICS" line is a placeholder and the closing braces are
        # elided, so it is not valid JSON until completed.
        settings = <<SETTINGS
        {
            "StorageAccount": "tfnpfsnhsuk",
            "ladCfg": {
                "sampleRateInSeconds": 15,
                "diagnosticMonitorConfiguration": {
                    "metrics": {
                        "metricAggregation": [
                            {
                                "scheduledTransferPeriod": "PT1M"
                            },
                            {
                                "scheduledTransferPeriod": "PT1H"
                            }
                        ],
                        "resourceId": "VM ID"
                    },
                    "eventVolume": "Medium",
                    "performanceCounters": {
                        "sinks": "",
                        .... MORE METRICS - THAT YOU REQUIRE
            }
            }
        }
        SETTINGS

        # Protected settings carry a SAS token (storageAccountSasToken), not an account key.
        protected_settings = <<PROTECTED_SETTINGS
        {
            "storageAccountName": "YOUR_ACCOUNT_NAME",
            "storageAccountSasToken": "YOUR SAS TOKEN"
        }
        PROTECTED_SETTINGS

        # NOTE(review): placeholder only - presumably a map of tags is expected here; confirm
        # against the azurerm provider docs.
        tags = "YOUR TAG"
        }
于 2020-02-20T11:32:34.683 回答
0

刚刚解决了一个类似的问题:

尝试通过 terraform 添加 LinuxDiagnostic Azure VM Extension 并出现错误

这包括获取 SAS 令牌和读取 json 文件。

于 2020-06-11T14:03:26.970 回答