Changeset f2974d8da963…
Parent 0ad9404448f7…
by Quentin Schroeder <quentin@fogcreek.com>
Changes to one file · Browse files at f2974d8da963 Showing diff from parent 0ad9404448f7 Diff from another changeset...
|
|
- #-----------------------------------------------------------------------------
-# This script will check that Kiln is running and healthy by hitting a variety of HTTP endpoints
-#
-# Usage example:
-# ./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln"
-#
-#-----------------------------------------------------------------------------
+<#
+.SYNOPSIS
+ Checks the vital signs of a Kiln Server
+.DESCRIPTION
+ This script will check a variety of vital signs to see if a Kiln Server is in good health.
+ When errors are found, the script can write them to std-out, email a message, and/or attempt
+ to automatically restart the necessary Kiln Services.
-param([string]$kilnURL = "http://localhost/fogbugz/kiln",
+.EXAMPLE
+ ./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln"
+ This is the most basic check, it will simply perform checks on all Kiln services at their default locations, write
+ information to std-out, and attempt to automatically restart the Kiln Storage Service and Kiln Queuing
+ Service when errors are found.
+
+.EXAMPLE
+ ./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln" -smtpServer smtp.myserver.com -from kiln-error-report@company.com -to kiln-admin@company.com -username bob -password pa55w0rd -smtpSsl
+ This will perform checks on all Kiln services at their default locations, and attempt to automatically
+ restart the Kiln Storage Service and Kiln Queuing Service when errors are found. Additionally, any errors
+ found will generate an email to "kiln-admin@company.com" from "kiln-error-report@company.com" which will be
+ sent via the smtp server "smtp.myserver.com" using SSL and the username "bob" with password "pa55w0rd" to
+ authenticate.
+
+.EXAMPLE
+ ./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln" -iisSiteName Default
+ This will perform checks on all Kiln services at their default location, write information to std-out,
+ and attempt to automatically restart the Kiln Storage Service and Kiln Queuing Service when errors are
+ found. It will also attempt to restart the IIS website named "Default" if the Kiln front page won't load.
+
+.EXAMPLE
+ ./Monitor-Kiln.ps1 -kilnURL "http://localhost/fogbugz/kiln" -noRestart
+ This will perform checks on all Kiln services at their default location, write information to std-out,
+ but will not attempt to restart any services.
+
+.PARAMETER kilnURL
+ The URL to reach the Kiln web page.
+
+.PARAMETER smtpServer
+ An SMTP server that this script can use to send error messages.
+
+.PARAMETER from
+ The from address to appear in emailed error messages.
+
+.PARAMETER to
+ The destination address for emailing error messages.
+
+.PARAMETER username
+ The username to log into the SMTP server.
+
+.PARAMETER password
+ The password to log into the SMTP server.
+
+.PARAMETER smtpSsl
+ A switch to use SSL when connecting to the SMTP server (by default SSL is off).
+
+.PARAMETER smtpPort
+ The port to connect to the SMTP server on (uses 587 by default).
+
+.PARAMETER backendVersionUrl
+ The URL of the Kiln backend. The default value is almost always correct.
+
+.PARAMETER esUrl
+ The URL of the ElasticSearch web front end. The default value is almost always correct.
+
+.PARAMETER queueStatsUrl
+ The URL of the Queue Stats page. The default value is almost always correct.
+
+.PARAMETER iisSiteName
+ The name of the IIS Website that FogBugz and Kiln are running under. If you provide this,
+ the script will attempt to restart IIS if it cannot load the Kiln web page.
+
+.PARAMETER noRestart
+ A flag which when set will force the script to only send alert messages and not
+ attempt to restart any services automatically.
+
+.PARAMETER reenqueuePath
+ The path to the Reenqueue script, this is used to try to resuscitate the Queue when problems are detected.
+
+.NOTES
+ Author: Quentin Schroeder
+ Date: Dec 13, 2012
+#>
+
+
+
# Script parameters.
#
# Two parameter sets exist:
#   "noemail" (default) - errors are only written to the console.
#   "email"             - errors are mailed; -smtpServer, -from, -to, -username
#                         and -password are all required.
#
# Parameters that carry no [Parameter(ParameterSetName=...)] attribute belong to
# every parameter set. Previously the general-purpose parameters (kilnURL, the
# service URLs, iisSiteName, noRestart, reenqueuePath) were restricted to the
# "noemail" set, so they could not be combined with -smtpServer as shown in the
# help examples. A default set is declared so that running the script with no
# arguments resolves to "noemail".
[CmdletBinding(DefaultParameterSetName="noemail")]
param(
    [string]$kilnURL = "http://localhost/fogbugz/kiln",

    # Supplying -smtpServer selects the "email" parameter set.
    [Parameter(ParameterSetName="email", Mandatory=$true)]
    [string]$smtpServer = "",

    [Parameter(ParameterSetName="email", Mandatory=$true)]
    [Parameter(ParameterSetName="noemail")]
    [string]$from = "",

    [Parameter(ParameterSetName="email", Mandatory=$true)]
    [Parameter(ParameterSetName="noemail")]
    [string]$to = "",

    [Parameter(ParameterSetName="email", Mandatory=$true)]
    [Parameter(ParameterSetName="noemail")]
    [string]$username = "",

    [Parameter(ParameterSetName="email", Mandatory=$true)]
    [Parameter(ParameterSetName="noemail")]
    [string]$password = "",

    [Parameter(ParameterSetName="email")]
    [switch]$smtpSsl,

    [Parameter(ParameterSetName="email")]
    [string]$smtpPort = 587,

    [string]$backendVersionUrl = "http://localhost:56783/version",

    [string]$esUrl = "http://localhost:9200/",

    [string]$queueStatsUrl = "http://localhost:56785/stats.json",

    [string]$iisSiteName = "",

    [switch]$noRestart,

    [string]$reenqueuePath = ".\reenqueue_2.9_tasks.ps1"
)
+
############################# FUNCTION DEFINITIONS #############################
# Writes $msg to the host prefixed with the current time in round-trip ("o") format.
#   $msg   - text to print.
#   $error - when true the line is printed in red; defaults to $false.
function Write-WithTime($msg, $error=$false){
    $stampedLine = "$(Get-Date -format o) $msg"
    if (-not $error)
    {
        write-host $stampedLine
    }
    else
    {
        write-host $stampedLine -foregroundcolor "red"
    }
}
# Reports an error found by one of the health checks.
#   $msg - human-readable description of the problem.
#
# Always prints an " ERROR!" marker. Then, based on the script-level mail
# parameters:
#   - all of smtpServer/from/to/username/password set: $msg is emailed (it is
#     not echoed to the console); a send failure is reported on the console.
#   - only some mail parameters set: a "parameters are missing" notice is printed.
#   - no mail parameters set: $msg is printed to the console in red.
function Handle-Error($msg)
{
    Write-WithTime " ERROR!" $true
    if ($smtpServer -and $from -and $to -and $username -and $password)
    {
        try
        {
            # Fixed: this previously referenced the undefined variable $smtpPor,
            # so the -smtpPort parameter never reached the SMTP client.
            $SmtpClient = New-Object Net.Mail.SmtpClient($smtpServer, $smtpPort)
            $SmtpClient.EnableSsl = $smtpSsl
            $SmtpClient.Credentials = New-Object System.Net.NetworkCredential($username, $password);
            $SmtpClient.Send($from, $to, "Kiln Server Error Report!", $msg)
        }
        catch [Exception]
        {
            # A mail failure must not abort the remaining health checks.
            $exceptionMessage = $_.Exception.Message
            Write-WithTime " Unable to send email, an error occurred.`n$exceptionMessage" $true
        }
    }
    elseif ($smtpServer -or $from -or $to -or $username -or $password -or $smtpSsl)
    {
        Write-WithTime " Unable to send email, some required parameters are missing!" $true
    }
    else
    {
        Write-WithTime " $msg" $true
    }
}
# Verifies that at least $count processes named $processName exist.
#   $processName         - process name as understood by Get-Process.
#   $count               - minimum number of instances required (default 1).
#   $serviceNeedsRestart - [ref] flag; set to $true when too few instances are found.
function Test-ProcessRunning($processName, $count = 1, [ref]$serviceNeedsRestart)
{
    Write-WithTime "Checking that $count instance(s) of $processName are running..."
    # Lookup errors (e.g. no such process) are silenced; they simply yield a count of 0.
    $instancesFound = @(Get-Process $processName -ea SilentlyContinue).Count
    if ($instancesFound -lt $count)
    {
        Handle-Error "Critical Kiln Server error. Process `'$processName`' could not be found!"
        $serviceNeedsRestart.value = $true
    }
}
# Verifies that a java.exe process whose command line mentions "ElasticSearch" exists.
#   $serviceNeedsRestart - [ref] flag; set to $true when no such process is found.
function Test-ElasticSearchRunning([ref]$serviceNeedsRestart)
{
    Write-WithTime "Checking that 1 instances(s) of ElasticSearch are running..."
    # The process name is just java.exe, so candidates are identified by
    # searching each java.exe command line for "ElasticSearch".
    $esProcesses = @(Get-WmiObject win32_process -Filter "name like 'java.exe'" | Where-Object {
        ($_.CommandLine | select-string "ElasticSearch" | measure).count -eq 1
    })
    if ($esProcesses.Count -eq 0)
    {
        Handle-Error "Critical Kiln Server error. Elastic Search is not running!"
        $serviceNeedsRestart.value = $true
    }
}
# Downloads $url and verifies that the body matches $searchString.
#   $url                 - address to fetch.
#   $searchString        - pattern handed to Select-String (so it is treated as
#                          a regular expression, matched case-insensitively).
#   $serviceNeedsRestart - [ref] flag; set to $true when the pattern is absent
#                          or the request fails with a WebException.
function Test-HttpResponse($url, $searchString, [ref]$serviceNeedsRestart)
{
    Write-WithTime "Checking for expected response from URL ($url)..."
    $client = New-Object net.webclient
    try
    {
        # The page is fetched once; the previous code issued the identical
        # request twice and discarded the first response.
        $response = $client.DownloadString($url)
        $result = $response | select-string $searchString
        if (!($result))
        {
            Handle-Error("Critical Kiln Server error. URL [$url] did not contain the expected string `'$searchString`'")
            $serviceNeedsRestart.value = $true
        }
    }
    catch [Net.WebException]
    {
        $exceptionMessage = $_.Exception.Message
        Handle-Error("Critical Kiln Server error. Could not reach URL [$url]`n$exceptionMessage")
        # Fixed: assign through .value. Assigning to $serviceNeedsRestart itself
        # only replaced the local [ref] and never informed the caller.
        $serviceNeedsRestart.value = $true
    }
    finally
    {
        $client.Dispose()
    }
}
# Downloads the queue statistics JSON from $queueStatsUrl, prints a summary and
# checks it for signs of trouble.
#   $serviceNeedsRestart - [ref] flag; set to $true when the reported time is
#                          more than 5 seconds away from local time, or when
#                          the stats cannot be fetched or parsed.
# A long queue (> 50) or many running tasks (> 16) only raise an alert. A task
# older than 15 minutes raises an alert and launches the reenqueue script.
function Test-QueueStats([ref]$serviceNeedsRestart)
{
    try
    {
        $response = (New-Object System.Net.WebClient).DownloadString($queueStatsUrl)
        $stats = Convert-JsonToXml($response)
        $statsTime = [long]$stats.root.currentTime."#text"
        # DateTime.Ticks are 100 ns units counted from 0001-01-01, so dividing
        # by 10e6 (= 10,000,000) gives seconds since that date.
        # NOTE(review): this assumes currentTime uses the same epoch - confirm
        # against the Queue Service. If it reports Unix time the drift check
        # below will request a restart on every run now that the flag is
        # actually propagated.
        $timeDiff = (((Get-Date).ToUniversalTime().Ticks / 10e6) - $statsTime)
        $queueLength = [int]$stats.root.queueLength."#text"
        $runningTaskCount = $stats.root.runningTasks.ChildNodes.count
        $oldestTaskAge = [int]$stats.root.oldestRunningTaskTotalSeconds."#text"

        Write-WithTime ""
        Write-WithTime "--Queue Stats Information--"
        Write-WithTime " Time reported: $timeDiff seconds from now"
        Write-WithTime " Queue Length: $queueLength"
        Write-WithTime " Running Task Count: $runningTaskCount"
        Write-WithTime " Age of oldest task: $oldestTaskAge seconds"
        # Fixed: the old test "(-gt 5) -or (-lt 5)" was true for every value
        # except exactly 5. The intent is a drift of more than 5 seconds in
        # either direction.
        if ([Math]::Abs($timeDiff) -gt 5)
        {
            # Fixed: write through .value so the caller's flag is updated.
            $serviceNeedsRestart.value = $true
        }
        # This is a warning sign, so just alert, but don't attempt to restart the service yet.
        if ($queueLength -gt 50)
        {
            Handle-Error("The Kiln Queue is exceptionally long ($queueLength tasks), this might by a symptom of a recent large change or a problem.")
        }
        # This is a warning sign, so just alert, but don't attempt to restart the service yet.
        if ($runningTaskCount -gt 16)
        {
            Handle-Error("Too many tasks ($runningTaskCount) are currently running in the Kiln Queue.")
        }
        # Longer than 15 minutes on a task? This is a bad sign, time to give it some help!
        if ( $oldestTaskAge -gt (15*60) )
        {
            Handle-Error("Oldest task has been running for $oldestTaskAge seconds! Attempting to retry all running tasks.")
            if (test-path $reenqueuePath)
            {
                # NOTE(review): Start-Process on a .ps1 path relies on the
                # file association for .ps1 - confirm this actually executes
                # the script on the target servers.
                Start-Process $reenqueuePath "-retryRunning"
            }
            else
            {
                Write-WithTime "Reenqueue script not found at ($reenqueuePath), unable to retry running tasks in Kiln Queue."
            }
        }
    }
    catch [Exception]
    {
        $exceptionMessage = $_.Exception.Message
        Handle-Error("Error obtaining or processing Kiln Queue Stats`n$exceptionMessage")
        # Fixed: write through .value so the caller's flag is updated.
        $serviceNeedsRestart.value = $true
    }
}
# This is provided as an alternative to ConvertFrom-JSON, which requires Powershell V3.
# Powershell V3 is not on most servers by default and requires a reboot to install, so here we are instead...
# Source: (https://www.cogmotive.com/blog/powershell/parsing-json-in-powershell-xml-the-member-item-is-already-present)
Add-Type -Assembly System.ServiceModel.Web,System.Runtime.Serialization
# Parses a JSON string into an XmlDocument using the .NET JSON reader.
#   $json - JSON text.
# Returns the XmlDocument; the top-level JSON value appears under the "root"
# element and leaf values are read via the "#text" property.
function Convert-JsonToXml([string]$json)
{
    # Encode as UTF-8. The former [byte[]][char[]] cast threw for any character
    # above U+00FF and produced invalid byte sequences for U+0080-U+00FF.
    $bytes = [System.Text.Encoding]::UTF8.GetBytes($json)
    $quotas = [System.Xml.XmlDictionaryReaderQuotas]::Max
    $jsonReader = [System.Runtime.Serialization.Json.JsonReaderWriterFactory]::CreateJsonReader($bytes,$quotas)
    try
    {
        $xml = new-object System.Xml.XmlDocument
        $xml.Load($jsonReader)
        $xml
    }
    finally
    {
        # Release the reader even when the JSON is malformed and Load throws.
        $jsonReader.Close()
    }
}
########################### END FUNCTION DEFINITIONS ###########################
# Main script: run every health check, collecting which services look unhealthy,
# then (unless -noRestart was given) restart whatever was flagged.
$tryAutoRestart = !($noRestart) # Needed a negative name for the parameter since the default is true. Sanity rename here.
# One flag per restartable component; the checks set these through [ref].
$kssNeedsRestart = $false   # Kiln Storage Service (backend, redis-server, ElasticSearch)
$kqsNeedsRestart = $false   # Kiln Queuing Service
$iisNeedsRestart = $false   # IIS site hosting the Kiln web page
Test-ProcessRunning "backend" 1 -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-ProcessRunning "redis-server" 2 -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-ElasticSearchRunning -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-ProcessRunning "QueueService" -serviceNeedsRestart ([ref]$kqsNeedsRestart)
Test-HttpResponse $kilnURL "Log on to Kiln" -serviceNeedsRestart ([ref]$iisNeedsRestart)
Test-HttpResponse $backendVersionUrl '"hg_version"' -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-HttpResponse $esUrl '"status" : 200' -serviceNeedsRestart ([ref]$kssNeedsRestart)
Test-HttpResponse $queueStatsUrl '"queueName":"Kiln"' -serviceNeedsRestart ([ref]$kqsNeedsRestart)
Test-QueueStats -serviceNeedsRestart ([ref]$kqsNeedsRestart)
Write-Host "`n"
# IIS is only restarted when the caller named the site via -iisSiteName.
if ($iisNeedsRestart -and $iisSiteName -and $tryAutoRestart)
{
    Write-WithTime "Attemping to restart site $iisSiteName in IIS...`n"
    $appcmd = $env:SystemRoot + "\system32\inetsrv\appcmd.exe"
    # The site name is quoted so names containing spaces (e.g. "Default Web Site") work.
    $stopArgs = "stop site /site.name:`"$iisSiteName`""
    $startArgs = "start site /site.name:`"$iisSiteName`""
    # -Wait makes the stop finish before the start is issued; without it both
    # appcmd processes were launched concurrently and could run out of order.
    Start-Process $appcmd $stopArgs -Wait
    Start-Process $appcmd $startArgs -Wait
}
if ($kssNeedsRestart -and $tryAutoRestart)
{
    Write-WithTime "Attempting to restart Kiln Storage Service..."
    # Find and stop the java.exe process that is running Elastic Search
    (Get-WmiObject win32_process -Filter "name like 'java.exe'") | ForEach-Object {
        if (($_.CommandLine | select-string "ElasticSearch" | measure).count -eq 1) {
            Stop-Process -id $_.ProcessId -Force -EV Err -EA "SilentlyContinue"
        }
    }
    # Try to stop the processes, but don't show any errors (it might already be stopped)
    # This will stop both of the redis-server.exe processes because the have the same name
    Stop-Process -name redis-server -Force -EV Err -EA "SilentlyContinue"
    Stop-Process -name backend -Force -EV Err -EA "SilentlyContinue"
    Start-Service KilnStorageService
}
if ($kqsNeedsRestart -and $tryAutoRestart)
{
    Write-WithTime "Attempting to restart Kiln Queuing Service..."
    # Try to stop the process, but don't show any errors (it might already be stopped)
    Stop-Process -name QueueService -Force -EV Err -EA "SilentlyContinue"
    Start-Sleep -s 1
    Start-Service "Kiln Queuing Service"
}
-
+write-host "`n"
\ No newline at end of file |
Loading...