|
#-----------------------------------------------------------------------------
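# This script checks that Kiln is running and healthy by verifying that its processes are running
# and by hitting a variety of HTTP endpoints. Unless -noRestart is given, it also tries to restart
# any service that failed a check.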
#
# Usage example:
# ./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln"
#
#-----------------------------------------------------------------------------
# Script parameters:
#   -kilnURL            URL of the Kiln web UI; its response must contain "Log on to Kiln",
#                       otherwise the IIS site is flagged for restart.
#   -smtpServer/-from/-to
#                       Mail settings used by Handle-Error. An e-mail is only attempted when
#                       all three are non-empty; otherwise the error is written to the console.
#   -backendVersionUrl  Backend version endpoint; its response must contain "hg_version".
#   -esUrl              ElasticSearch endpoint; its response must contain '"status" : 200'.
#   -queueStatsUrl      Queue statistics JSON endpoint, used by the HTTP check and by Test-QueueStats.
#   -iisSiteName        IIS site to stop/start via appcmd.exe. When empty, no IIS restart is attempted.
#   -noRestart          Report problems only; skip every automatic restart.
#   -reenqueuePath      Script launched with "-retryRunning" when the oldest running queue task
#                       is older than 15 minutes (only if the path exists).
param([string]$kilnURL = "http://localhost/fogbugz/kiln",
[string]$smtpServer = "",
[string]$from = "",
[string]$to = "",
[string]$backendVersionUrl = "http://localhost:56783/version",
[string]$esUrl = "http://localhost:9200/",
[string]$queueStatsUrl = "http://localhost:56785/stats.json",
[string]$iisSiteName = "",
[switch]$noRestart,
[string]$reenqueuePath = ".\reenqueue_2.9_tasks.ps1")
############################# FUNCTION DEFINITIONS #############################
# Writes a message to the host, prefixed with the current time in round-trip ("o") format.
#   $msg   - text to print after the timestamp.
#   $error - when true, the line is printed in red.
function Write-WithTime($msg, $error=$false){
    $stampedLine = "$(Get-Date -format o) $msg"
    if ($error)
    {
        Write-Host $stampedLine -ForegroundColor "red"
    }
    else
    {
        Write-Host $stampedLine
    }
}
# Reports an error found by one of the health checks.
#   $msg - human-readable description of the problem.
# Always prints an " ERROR!" marker. When the script-level $smtpServer, $from and $to are all
# set, the message is e-mailed; otherwise (or if sending the e-mail fails) it is written to
# the console in red.
function Handle-Error($msg)
{
    Write-WithTime " ERROR!" $true
    if ($smtpServer -and $from -and $to)
    {
        try
        {
            # The message is sent as the e-mail body so the report says what actually failed.
            # -ErrorAction Stop turns a delivery failure into a catchable error.
            Send-MailMessage -From $from -To $to -SmtpServer $smtpServer -Subject "Kiln Server Error Report!" -Body "$msg" -ErrorAction Stop
            return
        }
        catch
        {
            # Fall through to console output so the error is not lost when mail is unavailable.
            Write-WithTime " Could not send error report e-mail: $($_.Exception.Message)" $true
        }
    }
    Write-WithTime " $msg" $true
}
# Verifies that at least $count processes named $processName are running.
#   $processName         - process name as accepted by Get-Process.
#   $count               - minimum number of instances required (default 1).
#   $serviceNeedsRestart - [ref] flag; set to $true when too few instances are found.
function Test-ProcessRunning($processName, $count = 1, [ref]$serviceNeedsRestart)
{
    Write-WithTime "Checking that $count instance(s) of $processName are running..."

    # A missing process is an expected outcome here, so lookup errors are suppressed.
    $found = @(Get-Process -Name $processName -ErrorAction SilentlyContinue)
    if ($found.Count -ge $count)
    {
        return
    }

    Handle-Error "Critical Kiln Server error. Process `'$processName`' could not be found!"
    $serviceNeedsRestart.value = $true
}
# Verifies that ElasticSearch is running by looking for a java.exe process whose command
# line mentions "ElasticSearch".
#   $serviceNeedsRestart - [ref] flag; set to $true when no such process is found.
function Test-ElasticSearchRunning([ref]$serviceNeedsRestart)
{
    Write-WithTime "Checking that 1 instances(s) of ElasticSearch are running..."

    # WMI is used because the process name alone (java.exe) does not identify ElasticSearch.
    $esProcesses = @(Get-WmiObject win32_process -Filter "name like 'java.exe'" |
        Where-Object { $_.CommandLine -match "ElasticSearch" })

    if ($esProcesses.Count -eq 0)
    {
        Handle-Error "Critical Kiln Server error. Elastic Search is not running!"
        $serviceNeedsRestart.value = $true
    }
}
# Downloads $url and verifies that the response contains $searchString.
#   $url                 - address to fetch.
#   $searchString        - pattern expected in the response (matched with Select-String).
#   $serviceNeedsRestart - [ref] flag; set to $true when the URL cannot be reached or the
#                          expected text is missing.
function Test-HttpResponse($url, $searchString, [ref]$serviceNeedsRestart)
{
    Write-WithTime "Checking for expected response from URL ($url)..."
    $client = New-Object Net.WebClient
    try
    {
        # Fetch once; a second request only doubles the load and the chance of a spurious failure.
        $response = $client.DownloadString($url)
        $result = $response | select-string $searchString
        if (!($result))
        {
            Handle-Error("Critical Kiln Server error. URL [$url] did not contain the expected string `'$searchString`'")
            $serviceNeedsRestart.value = $true
        }
    }
    catch [Net.WebException]
    {
        $exceptionMessage = $_.Exception.Message
        Handle-Error("Critical Kiln Server error. Could not reach URL [$url]`n$exceptionMessage")
        # Must go through .value: assigning to the parameter itself only rebinds the local
        # variable and never reaches the caller's flag.
        $serviceNeedsRestart.value = $true
    }
    finally
    {
        $client.Dispose()
    }
}
# Inspects the Kiln Queue statistics published at the script-level $queueStatsUrl.
#   $serviceNeedsRestart - [ref] flag; set to $true when the stats timestamp is more than
#                          5 seconds away from the current time, or when the stats cannot
#                          be fetched or parsed.
# A long queue or too many running tasks only raises an alert. A task running for more than
# 15 minutes raises an alert and launches the script at $reenqueuePath, if it exists.
function Test-QueueStats([ref]$serviceNeedsRestart)
{
    try
    {
        $response = (New-Object System.Net.WebClient).DownloadString($queueStatsUrl)
        $stats = Convert-JsonToXml($response)
        $statsTime = [long]$stats.root.currentTime."#text"
        # Ticks are 100ns units, so dividing by 10e6 (= 1e7) gives seconds.
        # NOTE(review): this assumes currentTime is expressed in seconds on the same time base
        # as DateTime.Ticks (not Unix epoch seconds) - confirm against the queue service.
        $timeDiff = (((Get-Date).ToUniversalTime().Ticks / 10e6) - $statsTime)
        $queueLength = [int]$stats.root.queueLength."#text"
        $runningTaskCount = $stats.root.runningTasks.ChildNodes.count
        $oldestTaskAge = [int]$stats.root.oldestRunningTaskTotalSeconds."#text"
        # The stats timestamp must be within 5 seconds of now, in either direction.
        if (($timeDiff -gt 5) -or ($timeDiff -lt -5))
        {
            Handle-Error("Kiln Queue stats timestamp differs from the current time by $timeDiff seconds.")
            $serviceNeedsRestart.value = $true
        }
        # This is a warning sign, so just alert, but don't attempt to restart the service yet.
        if ($queueLength -gt 50)
        {
            Handle-Error("The Kiln Queue is exceptionally long ($queueLength tasks), this might by a symptom of a recent large change or a problem.")
        }
        # This is a warning sign, so just alert, but don't attempt to restart the service yet.
        if ($runningTaskCount -gt 16)
        {
            Handle-Error("Too many tasks ($runningTaskCount) are currently running in the Kiln Queue.")
        }
        # Longer than 15 minutes on a task? This is a bad sign, time to give it some help!
        if ( $oldestTaskAge -gt (15*60) )
        {
            Handle-Error("Oldest task has been running for $oldestTaskAge seconds!")
            if (test-path $reenqueuePath)
            {
                # NOTE(review): Start-Process on a .ps1 path uses the shell file association,
                # which may not execute the script - confirm this launches it as intended.
                Start-Process $reenqueuePath "-retryRunning"
            }
        }
    }
    catch [Exception]
    {
        $exceptionMessage = $_.Exception.Message
        Handle-Error("Error obtaining or processing Kiln Queue Stats`n$exceptionMessage")
        # Write through .value so the caller's flag is actually updated.
        $serviceNeedsRestart.value = $true
    }
}
# This is provided as an alternative to ConvertFrom-Json, which requires PowerShell V3.
# PowerShell V3 is not on most servers by default and requires a reboot to install, so here we are instead...
# Source: (https://www.cogmotive.com/blog/powershell/parsing-json-in-powershell-xml-the-member-item-is-already-present)
Add-Type -Assembly System.ServiceModel.Web,System.Runtime.Serialization

# Parses a JSON string and returns it as a System.Xml.XmlDocument.
#   $json - the JSON text to convert.
# The JSON object is exposed under a <root> element, and values are read as
# $xml.root.<property>."#text".
function Convert-JsonToXml([string]$json)
{
    # Encode as UTF-8 rather than casting each char to a byte: the cast throws for any
    # character above U+00FF and produces invalid UTF-8 for U+0080..U+00FF.
    $bytes = [System.Text.Encoding]::UTF8.GetBytes($json)
    $quotas = [System.Xml.XmlDictionaryReaderQuotas]::Max
    $jsonReader = [System.Runtime.Serialization.Json.JsonReaderWriterFactory]::CreateJsonReader($bytes,$quotas)
    try
    {
        $xml = new-object System.Xml.XmlDocument
        $xml.Load($jsonReader)
        $xml
    }
    finally
    {
        # Always release the reader, even when the JSON is malformed and Load throws.
        $jsonReader.Close()
    }
}
########################### END FUNCTION DEFINITIONS ###########################
# The switch is named negatively (-noRestart) because restarting is the default; flip it
# here so the rest of the script can read positively.
$tryAutoRestart = -not $noRestart

# One flag per restartable component. Each check below sets its flag through [ref] on failure.
$kssNeedsRestart = $false   # Kiln Storage Service (backend, redis, ElasticSearch)
$kqsNeedsRestart = $false   # Kiln Queuing Service
$iisNeedsRestart = $false   # IIS site hosting Kiln

# Process checks.
Test-ProcessRunning -processName "backend" -count 1 -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-ProcessRunning -processName "redis-server" -count 2 -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-ElasticSearchRunning -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-ProcessRunning -processName "QueueService" -serviceNeedsRestart ([ref]$kqsNeedsRestart)

# HTTP endpoint checks.
Test-HttpResponse -url $kilnURL -searchString "Log on to Kiln" -serviceNeedsRestart ([ref]$iisNeedsRestart)
Test-HttpResponse -url $backendVersionUrl -searchString '"hg_version"' -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-HttpResponse -url $esUrl -searchString '"status" : 200' -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-HttpResponse -url $queueStatsUrl -searchString '"queueName":"Kiln"' -serviceNeedsRestart ([ref]$kqsNeedsRestart)

# Queue statistics check.
Test-QueueStats -serviceNeedsRestart ([ref]$kqsNeedsRestart)

Write-Host "`n"
# Restart the IIS site when the Kiln web check failed, a site name was supplied and
# automatic restarts are enabled.
if ($iisNeedsRestart -and $iisSiteName -and $tryAutoRestart)
{
    Write-WithTime "Attemping to restart site $iisSiteName in IIS...`n"
    $appcmd = $env:SystemRoot + "\system32\inetsrv\appcmd.exe"
    # Quote the site name so names containing spaces (e.g. "Default Web Site") reach
    # appcmd as a single argument.
    $stopArgs = "stop site /site.name:`"$iisSiteName`""
    $startArgs = "start site /site.name:`"$iisSiteName`""
    # -Wait makes the stop finish before the start is issued; without it the two appcmd
    # processes run concurrently and the start can be processed first.
    Start-Process $appcmd $stopArgs -Wait
    Start-Process $appcmd $startArgs -Wait
}
# Restart the Kiln Storage Service when any of its checks failed and automatic restarts
# are enabled.
if ($tryAutoRestart -and $kssNeedsRestart)
{
    Write-WithTime "Attempting to restart Kiln Storage Service..."

    # Find and stop the java.exe process that is running Elastic Search.
    Get-WmiObject win32_process -Filter "name like 'java.exe'" |
        Where-Object { $_.CommandLine -match "ElasticSearch" } |
        ForEach-Object {
            Stop-Process -Id $_.ProcessId -Force -ErrorVariable Err -ErrorAction "SilentlyContinue"
        }

    # Stop the remaining processes quietly (they might already be stopped).
    # Stopping by name takes down both redis-server.exe instances, since they share a name.
    foreach ($processName in "redis-server", "backend")
    {
        Stop-Process -Name $processName -Force -ErrorVariable Err -ErrorAction "SilentlyContinue"
    }

    Start-Service KilnStorageService
}
# Restart the Kiln Queuing Service when any of its checks failed and automatic restarts
# are enabled.
if ($tryAutoRestart -and $kqsNeedsRestart)
{
    Write-WithTime "Attempting to restart Kiln Queuing Service..."

    # Kill the process quietly (it might already be stopped), then pause for a second
    # before asking the service to start again.
    Stop-Process -Name QueueService -Force -ErrorVariable Err -ErrorAction "SilentlyContinue"
    Start-Sleep -Seconds 1
    Start-Service -Name "Kiln Queuing Service"
}
|
Loading...