使用 Azure 资源管理器部署 Service Fabric 群集时,我经常遇到超时。大约有 20% 的部署会出现下面的错误,而使用相同的配置重新部署即可解决问题。
New-AzureRmResourceGroupDeployment : 6:54:02 AM - Resource Microsoft.Compute/virtualMachineScaleSets 'nodeType' failed
with message '{
"status": "Failed",
"error": {
"code": "ResourceDeploymentFailure",
"message": "The resource operation completed with terminal provisioning state 'Failed'.",
"details": [
{
"code": "VMExtensionHandlerNonTransientError",
"message": "Handler 'Microsoft.Azure.ServiceFabric.ServiceFabricNode' has reported failure for VM Extension
'nodeType_ServiceFabricNode' with terminal error code '1009' and error message: 'Enable failed for plugin (name:
Microsoft.Azure.ServiceFabric.ServiceFabricNode, version 1.0.0.35) with exception Command
C:\\Packages\\Plugins\\Microsoft.Azure.ServiceFabric.ServiceFabricNode\\1.0.0.35\\ServiceFabricExtensionHandler.exe of
Microsoft.Azure.ServiceFabric.ServiceFabricNode has not exited on time! Killing it...'"
}
]
}
}'
At C:\BuildAgent\work\d851d22c9abed7b9\Core\Core\scripts\Provision\ProvisionGeneric.ps1:39 char:1
+ New-AzureRmResourceGroupDeployment -Verbose -ResourceGroupName $resou ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ CategoryInfo : NotSpecified: (:) [New-AzureRmResourceGroupDeployment], Exception
+ FullyQualifiedErrorId : Microsoft.Azure.Commands.ResourceManager.Cmdlets.Implementation.NewAzureResourceGroupDep
loymentCmdlet
New-AzureRmResourceGroupDeployment : 6:54:02 AM - Handler 'Microsoft.Azure.ServiceFabric.ServiceFabricNode' has
reported failure for VM Extension 'nodeType_ServiceFabricNode' with terminal error code '1009' and error message: 'Enable
failed for plugin (name: Microsoft.Azure.ServiceFabric.ServiceFabricNode, version 1.0.0.35) with exception Command
C:\Packages\Plugins\Microsoft.Azure.ServiceFabric.ServiceFabricNode\1.0.0.35\ServiceFabricExtensionHandler.exe of
Microsoft.Azure.ServiceFabric.ServiceFabricNode has not exited on time! Killing it...'
At C:\BuildAgent\work\d851d22c9abed7b9\Core\Core\scripts\Provision\ProvisionGeneric.ps1:39 char:1
+ New-AzureRmResourceGroupDeployment -Verbose -ResourceGroupName $resou ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ CategoryInfo : NotSpecified: (:) [New-AzureRmResourceGroupDeployment], Exception
+ FullyQualifiedErrorId : Microsoft.Azure.Commands.ResourceManager.Cmdlets.Implementation.NewAzureResourceGroupDep
loymentCmdlet
Azure 资源管理器(ARM)模板的相关部分如下所示:
{
"apiVersion": "[variables('vmssApiVersion')]",
"type": "Microsoft.Compute/virtualMachineScaleSets",
"name": "[parameters('vmNodeType0Name')]",
"location": "[parameters('computeLocation')]",
"dependsOn": ["[concat('Microsoft.Network/virtualNetworks/', parameters('virtualNetworkName'))]",
"[concat('Microsoft.Storage/storageAccounts/', variables('uniqueStringArray0')[0])]",
"[concat('Microsoft.Storage/storageAccounts/', variables('uniqueStringArray0')[1])]",
"[concat('Microsoft.Storage/storageAccounts/', variables('uniqueStringArray0')[2])]",
"[concat('Microsoft.Storage/storageAccounts/', variables('uniqueStringArray0')[3])]",
"[concat('Microsoft.Storage/storageAccounts/', variables('uniqueStringArray0')[4])]",
"[concat('Microsoft.Network/loadBalancers/', concat('INT_LB','-', parameters('clusterName'),'-',parameters('vmNodeType0Name')))]",
"[concat('Microsoft.Network/loadBalancers/', concat('EXT_LB','-', parameters('clusterName'),'-',parameters('vmNodeType0Name')))]",
"[concat('Microsoft.Storage/storageAccounts/', parameters('supportLogStorageAccountName'))]",
"[concat('Microsoft.Storage/storageAccounts/', parameters('applicationDiagnosticsStorageAccountName'))]"],
"properties": {
"overprovision": "[parameters('overProvision')]",
"upgradePolicy": {
"mode": "Automatic"
},
"virtualMachineProfile": {
"extensionProfile": {
"extensions": [{
"name": "[concat(parameters('vmNodeType0Name'),'_ServiceFabricNode')]",
"properties": {
"type": "ServiceFabricNode",
"autoUpgradeMinorVersion": false,
"protectedSettings": {
"StorageAccountKey1": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('supportLogStorageAccountName')),'2015-05-01-preview').key1]",
"StorageAccountKey2": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('supportLogStorageAccountName')),'2015-05-01-preview').key2]"
},
"publisher": "Microsoft.Azure.ServiceFabric",
"settings": {
"clusterEndpoint": "[reference(parameters('clusterName')).clusterEndpoint]",
"nodeTypeRef": "[parameters('vmNodeType0Name')]",
"dataPath": "D:\\\\SvcFab",
"durabilityLevel": "Bronze",
"certificate": {
"thumbprint": "[parameters('clusterSecurityCertThumbprint')]",
"x509StoreName": "my"
}
},
"typeHandlerVersion": "1.0"
}
},
{
"name": "[concat('VMDiagnosticsVmExt','_vmNodeType0Name')]",
"properties": {
"type": "IaaSDiagnostics",
"autoUpgradeMinorVersion": true,
"protectedSettings": {
"storageAccountName": "[parameters('applicationDiagnosticsStorageAccountName')]",
"storageAccountKey": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('applicationDiagnosticsStorageAccountName')),'2015-05-01-preview').key1]",
"storageAccountEndPoint": "https://core.windows.net/"
},
"publisher": "Microsoft.Azure.Diagnostics",
"settings": {
"WadCfg": {
"DiagnosticMonitorConfiguration": {
"overallQuotaInMB": "50000",
"sinks": "ApplicationInsights",
"DiagnosticInfrastructureLogs": {
"scheduledTransferLogLevelFilter": "Error"
},
"EtwProviders": {
"EtwEventSourceProviderConfiguration": [{
"provider": "Microsoft-ServiceFabric-Actors",
"scheduledTransferKeywordFilter": "1",
"scheduledTransferPeriod": "PT1M",
"DefaultEvents": {
"eventDestination": "ServiceFabricReliableActorEvents"
}
},
{
"provider": "Microsoft-ServiceFabric-Services",
"scheduledTransferPeriod": "PT1M",
"DefaultEvents": {
"eventDestination": "ServiceFabricReliableServiceEvents"
}
}],
"EtwManifestProviderConfiguration": [{
"provider": "cbd93bc2-71e5-4566-b3a7-595d8eeca6e8",
"scheduledTransferLogLevelFilter": "Information",
"scheduledTransferKeywordFilter": "4611686018427387904",
"scheduledTransferPeriod": "PT1M",
"DefaultEvents": {
"eventDestination": "ServiceFabricSystemEventTable"
}
}]
}
},
"SinksConfig": {
"Sink": [{
"name": "ApplicationInsights",
"ApplicationInsights": "[parameters('appInsightsKey')]",
"Channels": {
"Channel": [{
"logLevel": "Error",
"name": "Errors"
},
{
"logLevel": "Verbose",
"name": "AppLogs"
}]
}
}]
}
},
"StorageAccount": "[parameters('applicationDiagnosticsStorageAccountName')]"
},
"typeHandlerVersion": "1.5"
}
}]
},
"networkProfile": {
"networkInterfaceConfigurations": [{
"name": "[concat(parameters('nicName'), '-0')]",
"properties": {
"ipConfigurations": [{
"name": "[concat(parameters('nicName'),'-',0)]",
"properties": {
"loadBalancerBackendAddressPools": [{
"id": "[variables('lbIntPoolId')]"
},
{
"id": "[variables('lbExtPoolId')]"
}],
"subnet": {
"id": "[variables('subnet0Ref')]"
}
}
}],
"primary": true
}
}]
},
"osProfile": {
"adminPassword": "[parameters('adminPassword')]",
"adminUsername": "[parameters('adminUsername')]",
"computernamePrefix": "[parameters('vmNodeType0Name')]",
"secrets": [{
"sourceVault": {
"id": "[parameters('keyVaultName')]"
},
"vaultCertificates": [{
"certificateStore": "my",
"certificateUrl": "[parameters('encyphermentCertId')]"
},
{
"certificateStore": "my",
"certificateUrl": "[parameters('identityServerSigningCertId')]"
},
{
"certificateStore": "my",
"certificateUrl": "[parameters('clusterSecurityCertId')]"
},
{
"certificateStore": "My",
"certificateUrl": "[parameters('adminCertId')]"
},
{
"certificateStore": "CertificateAuthority",
"certificateUrl": "[parameters('clusterSecurityCertId')]"
}]
}]
},
"storageProfile": {
"imageReference": {
"publisher": "[parameters('vmImagePublisher')]",
"offer": "[parameters('vmImageOffer')]",
"sku": "[parameters('vmImageSku')]",
"version": "[parameters('vmImageVersion')]"
},
"osDisk": {
"vhdContainers": ["[concat('https://', variables('uniqueStringArray0')[0], '.blob.core.windows.net/', parameters('vmStorageAccountContainerName'))]",
"[concat('https://', variables('uniqueStringArray0')[1], '.blob.core.windows.net/', parameters('vmStorageAccountContainerName'))]",
"[concat('https://', variables('uniqueStringArray0')[2], '.blob.core.windows.net/', parameters('vmStorageAccountContainerName'))]",
"[concat('https://', variables('uniqueStringArray0')[3], '.blob.core.windows.net/', parameters('vmStorageAccountContainerName'))]",
"[concat('https://', variables('uniqueStringArray0')[4], '.blob.core.windows.net/', parameters('vmStorageAccountContainerName'))]"],
"name": "vmssosdisk",
"caching": "ReadOnly",
"createOption": "FromImage"
}
}
}
},
"sku": {
"name": "[parameters('vmNodeType0Size')]",
"capacity": "[parameters('vmNodeType0Count')]",
"tier": "Standard"
},
"tags": {
"resourceType": "Service Fabric",
"clusterName": "[parameters('clusterName')]"
}
},
这个问题似乎在早上比其他任何时段都更容易发生,导致我们的 CI 构建经常失败。我该如何诊断这个问题?如果这只是我们应当预料到的偶发情况,那么捕获该错误并强制重新部署的最佳方法是什么?