|
<#
.SYNOPSIS
Checks the vital signs of a Kiln Server
.DESCRIPTION
This script will check a variety of vital signs to see if a Kiln Server is in good health.
When errors are found, the script can write them to std-out, email a message, and/or attempt
to automatically restart the necessary Kiln Services.
.EXAMPLE
./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln"
This is the most basic check, it will simply perform checks on all Kiln services at their default locations, write
information to std-out, and attempt to automatically restart the Kiln Storage Service and Kiln Queuing
Service when errors are found.
.EXAMPLE
./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln" -smtpServer smtp.myserver.com -from kiln-error-report@company.com -to kiln-admin@company.com -username bob -password pa55w0rd -smtpSsl
This will perform checks on all Kiln services at their default locations, and attempt to automatically
restart the Kiln Storage Service and Kiln Queuing Service when errors are found. Additionally, any errors
found will generate an email to "kiln-admin@company.com" from "kiln-error-report@company.com" which will be
sent via the smtp server "smtp.myserver.com" using SSL and the username "bob" with password "pa55w0rd" to
authenticate.
.EXAMPLE
./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln" -iisSiteName Default
This will perform checks on all Kiln services at their default location, write information to std-out,
and attempt to automatically restart the Kiln Storage Service and Kiln Queuing Service when errors are
found. It will also attempt to restart the IIS website named "Default" if the Kiln front page won't load.
.EXAMPLE
./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln" -noRestart
This will perform checks on all Kiln services at their default location, write information to std-out,
but will not attempt to restart any services.
.PARAMETER kilnURL
The URL to reach the Kiln web page.
.PARAMETER smtpServer
An SMTP server that this script can use to send error messages.
.PARAMETER from
The from address to appear in emailed error messages.
.PARAMETER to
The destination address for emailing error messages.
.PARAMETER username
The username to log into the SMTP server.
.PARAMETER password
The password to log into the SMTP server.
.PARAMETER smtpSsl
A switch to use SSL when connecting to the SMTP server (by default SSL is off).
.PARAMETER smtpPort
The port to connect to the SMTP server on (uses 587 by default).
.PARAMETER backendVersionUrl
The URL of the Kiln backend. The default value is almost always correct.
.PARAMETER esUrl
The URL of the ElasticSearch web front end. The default value is almost always correct.
.PARAMETER queueStatsUrl
The URL of the Queue Stats page. The default value is almost always correct.
.PARAMETER iisSiteName
The name of the IIS Website that FogBugz and Kiln are running under. If you provide this,
the script will attempt to restart IIS if it cannot load the Kiln web page.
.PARAMETER noRestart
A flag which when set will force the script to only send alert messages and not
attempt to restart any services automatically.
.PARAMETER reenqueuePath
The path to the Reenqueue script, this is used to try to resuscitate the Queue when problems are detected.
.NOTES
Author: Quentin Schroeder
Date: Dec 13, 2012
#>
param([string]$kilnURL = "http://localhost/fogbugz/kiln", # Kiln front page; checked for the text "Log on to Kiln"
[string]$smtpServer = "", # Email is only attempted when smtpServer, from, to, username and password are all non-empty
[string]$from = "", # Sender address passed to SmtpClient.Send
[string]$to = "", # Recipient address passed to SmtpClient.Send
[string]$username = "", # SMTP credential (user)
[string]$password = "", # SMTP credential (password)
[switch]$smtpSsl, # Assigned to SmtpClient.EnableSsl
[string]$smtpPort = 587, # Declared as a string; Handle-Error compares it against 587 to detect a partially supplied SMTP configuration
[string]$backendVersionUrl = "http://localhost:56783/version", # Response must contain "hg_version"
[string]$esUrl = "http://localhost:9200/", # Response must contain '"status" : 200'
[string]$queueStatsUrl = "http://localhost:56785/stats.json", # Fetched by both Test-HttpResponse and Test-QueueStats
[string]$iisSiteName = "", # When empty, the IIS site is never restarted
[switch]$noRestart, # Inverted into $tryAutoRestart in the script body
[string]$reenqueuePath = ".\reenqueue_2.9_tasks.ps1" # Script started with -retryRunning when the oldest queue task is older than 15 minutes
)
############################# FUNCTION DEFINITIONS #############################
# Writes $msg to the host, prefixed with a round-trip ("o" format) timestamp.
# When $error is truthy the line is written in red; otherwise the default colour is used.
function Write-WithTime($msg, $error=$false){
    $stampedLine = "$(Get-Date -format o) $msg"
    if (-not $error)
    {
        Write-Host $stampedLine
    }
    else
    {
        Write-Host $stampedLine -ForegroundColor "red"
    }
}
# Reports an error found by one of the checks.
#
# Always writes an " ERROR!" marker line to the host. Then:
#   * if smtpServer, from, to, username and password are all set, emails $msg
#     (a failure to send is written to the host instead of being thrown);
#   * if only some SMTP-related parameters were supplied, warns that email is
#     not possible and writes $msg to the host;
#   * otherwise just writes $msg to the host.
#
# Parameters:
#   $msg - the error text to report.
# Reads the script-level parameters $smtpServer, $smtpPort, $smtpSsl, $from,
# $to, $username and $password.
function Handle-Error($msg)
{
    Write-WithTime " ERROR!" $true
    if ($smtpServer -and $from -and $to -and $username -and $password)
    {
        try
        {
            # Use the -smtpPort script parameter. The previous code referenced the
            # undefined variable $smtpPor, so the configured port never reached SmtpClient.
            $SmtpClient = New-Object Net.Mail.SmtpClient($smtpServer, $smtpPort)
            $SmtpClient.EnableSsl = $smtpSsl
            $SmtpClient.Credentials = New-Object System.Net.NetworkCredential($username, $password);
            $SmtpClient.Send($from, $to, "Kiln Server Error Report!", $msg)
        }
        catch [Exception]
        {
            # Email is best-effort: report the failure locally and carry on with the checks.
            $exceptionMessage = $_.Exception.Message
            Write-WithTime " Unable to send email, an error occurred.`n$exceptionMessage" $true
        }
    }
    elseif ($smtpServer -or $from -or $to -or $username -or $password -or $smtpSsl -or ($smtpPort -ne 587))
    {
        # Some, but not all, of the SMTP settings were given: tell the operator why no mail was sent.
        Write-WithTime " Unable to send email, some required parameters are missing! Writing to std-out." $true
        Write-WithTime " $msg" $true
    }
    else
    {
        Write-WithTime " $msg" $true
    }
}
# Verifies that at least $count processes named $processName are running.
#
# Parameters:
#   $processName         - process name handed to Get-Process.
#   $count               - minimum number of instances required (default 1).
#   $serviceNeedsRestart - [ref] flag set to $true when too few instances are found.
function Test-ProcessRunning($processName, $count = 1, [ref]$serviceNeedsRestart)
{
    Write-WithTime "Checking that $count instance(s) of $processName are running..."

    # Lookup errors are suppressed so a missing process simply counts as zero instances.
    $tally = Get-Process $processName -ErrorAction SilentlyContinue | Select-Object id | Measure-Object
    if ($tally.Count -ge $count)
    {
        return
    }

    Handle-Error "Critical Kiln Server error. Process `'$processName`' could not be found!"
    $serviceNeedsRestart.value = $true
}
# Verifies that a java.exe process whose command line mentions "ElasticSearch" is running.
#
# Parameters:
#   $serviceNeedsRestart - [ref] flag set to $true when no such process is found.
function Test-ElasticSearchRunning([ref]$serviceNeedsRestart)
{
    Write-WithTime "Checking that 1 instances(s) of ElasticSearch are running..."

    # Keep only the java.exe processes whose command line matches "ElasticSearch".
    $esProcesses = @(Get-WmiObject win32_process -Filter "name like 'java.exe'" | Where-Object {
        ($_.CommandLine | Select-String "ElasticSearch" | Measure-Object).Count -eq 1
    })

    if ($esProcesses.Count -eq 0)
    {
        Handle-Error "Critical Kiln Server error. Elastic Search is not running!"
        $serviceNeedsRestart.value = $true
    }
}
# Downloads $url and checks that the response body contains $searchString.
#
# Parameters:
#   $url                 - address to fetch.
#   $searchString        - pattern handed to Select-String; the body must match it.
#   $serviceNeedsRestart - [ref] flag set to $true when the URL cannot be reached
#                          or the expected text is missing.
function Test-HttpResponse($url, $searchString, [ref]$serviceNeedsRestart)
{
    Write-WithTime "Checking for expected response from URL ($url)..."
    try
    {
        # Fetch once. The previous code issued the same request twice and discarded the first result.
        $response = (New-Object net.webclient).DownloadString($url)
        $result = $response | select-string $searchString
        if (!($result))
        {
            Handle-Error("Critical Kiln Server error. URL [$url] did not contain the expected string `'$searchString`'")
            $serviceNeedsRestart.value = $true
        }
    }
    catch [Net.WebException]
    {
        # Connection or HTTP-level failure: report it and flag the owning service.
        $exceptionMessage = $_.Exception.Message
        Handle-Error("Critical Kiln Server error. Could not reach URL [$url]`n$exceptionMessage")
        $serviceNeedsRestart.value = $true
    }
}
# Fetches the Kiln Queue stats JSON from $queueStatsUrl, prints a summary and
# checks it for signs of trouble.
#
# Checks performed:
#   * clock drift between the stats "currentTime" and the local UTC clock of more
#     than 5 seconds either way -> reported and flagged for restart;
#   * queue length above 50        -> reported only;
#   * more than 16 running tasks   -> reported only;
#   * oldest running task older than 15 minutes -> reported, and the reenqueue
#     script at $reenqueuePath is launched with -retryRunning.
# Any exception while fetching or parsing the stats is reported and flags a restart.
#
# Parameters:
#   $serviceNeedsRestart - [ref] flag set to $true when a restart is warranted.
function Test-QueueStats([ref]$serviceNeedsRestart)
{
    try
    {
        $response = (New-Object System.Net.WebClient).DownloadString($queueStatsUrl)
        $stats = Convert-JsonToXml($response)
        $statsTime = [long]$stats.root.currentTime."#text"
        # Ticks are 100ns units, so dividing by 10e6 (= 1e7) yields seconds.
        # NOTE(review): assumes currentTime uses the same epoch as .NET ticks - confirm against the stats service.
        $timeDiff = (((Get-Date).ToUniversalTime().Ticks / 10e6) - $statsTime)
        $queueLength = [int]$stats.root.queueLength."#text"
        $runningTaskCount = $stats.root.runningTasks.ChildNodes.count
        $oldestTaskAge = [int]$stats.root.oldestRunningTaskTotalSeconds."#text"
        Write-WithTime ""
        Write-WithTime "--Queue Stats Information--"
        Write-WithTime " Time Accuracy: within $timeDiff seconds"
        Write-WithTime " Queue Length: $queueLength"
        Write-WithTime " Running Task Count: $runningTaskCount"
        Write-WithTime " Age of oldest task: $oldestTaskAge seconds"
        if (($timeDiff -gt 5) -or ($timeDiff -lt -5))
        {
            # Report the reason, like every other restart trigger does; previously
            # this flagged a restart without telling the operator why.
            Handle-Error("The Kiln Queue Stats time differs from the local clock by $timeDiff seconds, the Kiln Queuing Service needs to be restarted.")
            $serviceNeedsRestart.value = $true
        }
        # This is a warning sign, so just alert, but don't attempt to restart the service yet.
        if ($queueLength -gt 50)
        {
            Handle-Error("The Kiln Queue is exceptionally long ($queueLength tasks), this might by a symptom of a recent large change or a problem.")
        }
        # This is a warning sign, so just alert, but don't attempt to restart the service yet.
        if ($runningTaskCount -gt 16)
        {
            Handle-Error("Too many tasks ($runningTaskCount) are currently running in the Kiln Queue.")
        }
        # Longer than 15 minutes on a task? This is a bad sign, time to give it some help!
        if ( $oldestTaskAge -gt (15*60) )
        {
            Handle-Error("Oldest task has been running for $oldestTaskAge seconds! Attempting to retry all running tasks.")
            if (test-path $reenqueuePath)
            {
                # Start-Process on a .ps1 file opens it with the associated program
                # (Notepad by default) rather than executing it, so launch the script
                # through powershell.exe. The path is resolved to an absolute one so the
                # child process does not depend on its working directory.
                $reenqueueScript = (Resolve-Path $reenqueuePath).ProviderPath
                Start-Process powershell.exe -ArgumentList "-File `"$reenqueueScript`" -retryRunning"
            }
            else
            {
                Write-WithTime "Reenqueue script not found at ($reenqueuePath), unable to retry running tasks in Kiln Queue."
            }
        }
    }
    catch [Exception]
    {
        $exceptionMessage = $_.Exception.Message
        Handle-Error("Error obtaining or processing Kiln Queue Stats`n$exceptionMessage")
        $serviceNeedsRestart.value = $true
    }
}
# This is provided as an alternative to ConvertFrom-JSON, which requires Powershell V3.
# Powershell V3 is not on most servers by default and requires a reboot to install, so here we are instead...
# Source: (https://www.cogmotive.com/blog/powershell/parsing-json-in-powershell-xml-the-member-item-is-already-present)
Add-Type -Assembly System.ServiceModel.Web,System.Runtime.Serialization

# Parses a JSON string into an XmlDocument using the .NET JSON reader.
#
# Parameters:
#   $json - the JSON text to parse.
# Returns:
#   A System.Xml.XmlDocument holding the parsed JSON. The callers in this script
#   read values through the "root" element and each child's "#text".
function Convert-JsonToXml([string]$json)
{
    # Encode as UTF-8. The previous [byte[]][char[]] cast threw for any character
    # above U+00FF and produced invalid UTF-8 for characters in U+0080..U+00FF.
    $bytes = [System.Text.Encoding]::UTF8.GetBytes($json)
    $quotas = [System.Xml.XmlDictionaryReaderQuotas]::Max
    $jsonReader = [System.Runtime.Serialization.Json.JsonReaderWriterFactory]::CreateJsonReader($bytes,$quotas)
    try
    {
        $xml = new-object System.Xml.XmlDocument
        $xml.Load($jsonReader)
        $xml
    }
    finally
    {
        # Always release the reader, even when loading fails.
        $jsonReader.Close()
    }
}
########################### END FUNCTION DEFINITIONS ###########################
# Script body: run every health check, then restart whichever services were flagged
# (unless -noRestart was given).
$tryAutoRestart = !($noRestart) # Needed a negative name for the parameter since the default is true. Sanity rename here.

# One flag per restartable component; the checks below set them through [ref].
$kssNeedsRestart = $false   # Kiln Storage Service (backend, redis-server, ElasticSearch)
$kqsNeedsRestart = $false   # Kiln Queuing Service
$iisNeedsRestart = $false   # IIS site hosting the Kiln web page

Test-ProcessRunning "backend" 1 -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-ProcessRunning "redis-server" 2 -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-ElasticSearchRunning -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-ProcessRunning "QueueService" -serviceNeedsRestart ([ref]$kqsNeedsRestart)
Test-HttpResponse $kilnURL "Log on to Kiln" -serviceNeedsRestart ([ref]$iisNeedsRestart)
Test-HttpResponse $backendVersionUrl '"hg_version"' -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-HttpResponse $esUrl '"status" : 200' -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-HttpResponse $queueStatsUrl '"queueName":"Kiln"' -serviceNeedsRestart ([ref]$kqsNeedsRestart)
Test-QueueStats -serviceNeedsRestart ([ref]$kqsNeedsRestart)
Write-Host "`n"

if ($iisNeedsRestart -and $iisSiteName -and $tryAutoRestart)
{
    Write-WithTime "Attemping to restart site $iisSiteName in IIS...`n"
    $appcmd = $env:SystemRoot + "\system32\inetsrv\appcmd.exe"
    # Quote the site name so names containing spaces (e.g. "Default Web Site") reach appcmd as one argument.
    $stopArgs = "stop site /site.name:`"$iisSiteName`""
    $startArgs = "start site /site.name:`"$iisSiteName`""
    # -Wait makes the stop finish before the start is issued; without it the two
    # appcmd processes ran concurrently and the start could be processed first.
    Start-Process $appcmd $stopArgs -Wait
    Start-Process $appcmd $startArgs -Wait
}

if ($kssNeedsRestart -and $tryAutoRestart)
{
    Write-WithTime "Attempting to restart Kiln Storage Service..."
    # Find and stop the java.exe process that is running Elastic Search
    (Get-WmiObject win32_process -Filter "name like 'java.exe'") | ForEach-Object {
        if (($_.CommandLine | select-string "ElasticSearch" | measure).count -eq 1) {
            Stop-Process -id $_.ProcessId -Force -EV Err -EA "SilentlyContinue"
        }
    }
    # Try to stop the processes, but don't show any errors (it might already be stopped)
    # This will stop both of the redis-server.exe processes because the have the same name
    Stop-Process -name redis-server -Force -EV Err -EA "SilentlyContinue"
    Stop-Process -name backend -Force -EV Err -EA "SilentlyContinue"
    Start-Service KilnStorageService
}

if ($kqsNeedsRestart -and $tryAutoRestart)
{
    Write-WithTime "Attempting to restart Kiln Queuing Service..."
    # Try to stop the process, but don't show any errors (it might already be stopped)
    Stop-Process -name QueueService -Force -EV Err -EA "SilentlyContinue"
    # Brief pause after killing the process before asking the service to start again.
    Start-Sleep -s 1
    Start-Service "Kiln Queuing Service"
}
write-host "`n"
|
Loading...