Compare commits
	
		
			3 Commits
		
	
	
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
| | 5a0382e8a3 | | |
| | e3de4a2ef6 | | |
| | 80ad9d9dc3 | | |
							
								
								
									
										165
									
								
								watchdog.go
									
									
									
									
									
								
							
							
						
						
									
										165
									
								
								watchdog.go
									
									
									
									
									
								
							| @ -33,6 +33,12 @@ func (s Status) String() string { | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| const ( | ||||
| 	MessageDown   = "went down" | ||||
| 	MessageUp     = "came back up" | ||||
| 	MessageHiccup = "hiccupped" | ||||
| ) | ||||
| 
 | ||||
| type Dog struct { | ||||
| 	Watchdog      string | ||||
| 	Name          string | ||||
| @ -47,15 +53,15 @@ type Dog struct { | ||||
| 	status        Status | ||||
| 	changed       bool | ||||
| 	error         error | ||||
| 	failures      int | ||||
| 	passes        int | ||||
| 	lastFailed    time.Time | ||||
| 	lastPassed    time.Time | ||||
| 	lastNotified  time.Time | ||||
| 	//failures      int | ||||
| 	//passes        int | ||||
| 	//lastFailed    time.Time | ||||
| 	//lastPassed    time.Time | ||||
| 	//lastNotified time.Time | ||||
| } | ||||
| 
 | ||||
| func New(d *Dog) *Dog { | ||||
| 	d.lastPassed = time.Now().Add(-5 * time.Minute) | ||||
| 	//d.lastPassed = time.Now().Add(-5 * time.Minute) | ||||
| 	d.status = StatusUp | ||||
| 	d.changed = false | ||||
| 	return d | ||||
| @ -70,88 +76,87 @@ func (d *Dog) Watch() { | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| // Now that I've added the ability to notify when a server is back up | ||||
| // this definitely needs some refactoring. It's bad now. | ||||
| func (d *Dog) watch() { | ||||
| 	d.Logger <- fmt.Sprintf("Check: '%s'", d.Name) | ||||
| 
 | ||||
| 	err := d.check() | ||||
| 	// This may be up or down | ||||
| 	err := d.hardcheck() | ||||
| 	if nil == err { | ||||
| 		d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) | ||||
| 		// if it's down, coming up, notify | ||||
| 		if d.changed { | ||||
| 			d.notify("came back up") | ||||
| 			d.notify(MessageUp) | ||||
| 		} | ||||
| 		return | ||||
| 	} | ||||
| 
 | ||||
| 	time.Sleep(time.Duration(5) * time.Second) | ||||
| 
 | ||||
| 	err2 := d.check() | ||||
| 	if nil != err2 { | ||||
| 		d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2) | ||||
| 	} else { | ||||
| 		d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err) | ||||
| 		return | ||||
| 	// If being down is a change, check to see if it's just a hiccup | ||||
| 	if d.changed { | ||||
| 		time.Sleep(time.Duration(5) * time.Second) | ||||
| 		err2 := d.softcheck() | ||||
| 		if nil != err2 { | ||||
| 			// it's really down | ||||
| 			d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2) | ||||
| 		} else { | ||||
| 			// it's not really down, so reset the change info | ||||
| 			d.changed = false | ||||
| 			d.status = StatusUp | ||||
| 			// and notify of the hiccup | ||||
| 			d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err) | ||||
| 			d.notify(MessageHiccup) | ||||
| 			return | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	// TODO what if the server is flip-flopping rapidly? | ||||
| 	// how to rate limit? | ||||
| 	// "{{ .Server }} is on cooldown for 30 minutes" | ||||
| 
 | ||||
| 	// * We've had success since the last notification | ||||
| 	// * It's been at least 5 minutes since the last notification | ||||
| 	//fiveMinutesAgo := time.Now().Add(-5 * time.Minute) | ||||
| 	//if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) { | ||||
| 	//} | ||||
| 
 | ||||
| 	t := 10 | ||||
| 	for { | ||||
| 		// try to recover, then backoff exponentially | ||||
| 		d.recover() | ||||
| 		time.Sleep(time.Duration(t) * time.Second) | ||||
| 		// backoff | ||||
| 		t *= 2 | ||||
| 		err := d.check() | ||||
| 		if nil != err { | ||||
| 			d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err) | ||||
| 		if t > 120 { | ||||
| 			t = 120 | ||||
| 		} | ||||
| 
 | ||||
| 		// We should notify if | ||||
| 		// * The status has changed | ||||
| 		// | ||||
| 		// TODO what if the server is flip-flopping rapidly? | ||||
| 		// how to rate limit? | ||||
| 		// "{{ .Server }} is on cooldown for 30 minutes" | ||||
| 		if d.changed { | ||||
| 			d.notify("went down") | ||||
| 			if StatusUp == d.status { | ||||
| 				break | ||||
| 		err := d.softcheck() | ||||
| 		if nil != err { | ||||
| 			// this is down, and we know it's down | ||||
| 			d.status = StatusDown | ||||
| 			d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err) | ||||
| 			if d.changed { | ||||
| 				d.changed = false | ||||
| 				d.notify(MessageDown) | ||||
| 			} | ||||
| 
 | ||||
| 			// * We've had success since the last notification | ||||
| 			// * It's been at least 5 minutes since the last notification | ||||
| 			//fiveMinutesAgo := time.Now().Add(-5 * time.Minute) | ||||
| 			//if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) { | ||||
| 			//} | ||||
| 			//if !failure || d.failures >= 5 { | ||||
| 			// go back to the main 5-minute loop | ||||
| 			//	break | ||||
| 			//} | ||||
| 		} else { | ||||
| 			// it came back up | ||||
| 			d.status = StatusUp | ||||
| 			d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) | ||||
| 			if d.changed { | ||||
| 				// and the downtime was short - just a recovery | ||||
| 				d.notify(MessageHiccup) | ||||
| 			} else { | ||||
| 				// and the downtime was some time | ||||
| 				d.notify(MessageUp) | ||||
| 			} | ||||
| 			d.changed = false | ||||
| 			break | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func (d *Dog) check() error { | ||||
| 	previousStatus := d.status | ||||
| 
 | ||||
| 	var err error | ||||
| 	defer func() { | ||||
| 		// Are we up, or down? | ||||
| 		if nil != err { | ||||
| 			d.status = StatusDown | ||||
| 			d.failures += 1 | ||||
| 			d.lastFailed = time.Now() | ||||
| 		} else { | ||||
| 			d.status = StatusUp | ||||
| 			d.lastPassed = time.Now() | ||||
| 			d.passes += 1 | ||||
| 			d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) | ||||
| 		} | ||||
| 
 | ||||
| 		// Has that changed? | ||||
| 		if previousStatus != d.status { | ||||
| 			d.changed = true | ||||
| 		} else { | ||||
| 			d.changed = false | ||||
| 		} | ||||
| 	}() | ||||
| 
 | ||||
| func (d *Dog) softcheck() error { | ||||
| 	client := NewHTTPClient() | ||||
| 	response, err := client.Get(d.CheckURL) | ||||
| 	if nil != err { | ||||
| @ -174,7 +179,7 @@ func (d *Dog) check() error { | ||||
| 	} | ||||
| 
 | ||||
| 	if "" != d.Badwords { | ||||
| 		if !bytes.Contains(b, []byte(d.Badwords)) { | ||||
| 		if bytes.Contains(b, []byte(d.Badwords)) { | ||||
| 			err = fmt.Errorf("Down: '%s' Found for '%s'", d.Badwords, d.Name) | ||||
| 			d.Logger <- fmt.Sprintf("%s", err) | ||||
| 			d.error = err | ||||
| @ -185,6 +190,32 @@ func (d *Dog) check() error { | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func (d *Dog) hardcheck() error { | ||||
| 	previousStatus := d.status | ||||
| 
 | ||||
| 	err := d.softcheck() | ||||
| 
 | ||||
| 	// Are we up, or down? | ||||
| 	if nil != err { | ||||
| 		d.status = StatusDown | ||||
| 		//d.failures += 1 | ||||
| 		//d.lastFailed = time.Now() | ||||
| 	} else { | ||||
| 		d.status = StatusUp | ||||
| 		//d.lastPassed = time.Now() | ||||
| 		//d.passes += 1 | ||||
| 	} | ||||
| 
 | ||||
| 	// Has that changed? | ||||
| 	if previousStatus != d.status { | ||||
| 		d.changed = true | ||||
| 	} else { | ||||
| 		d.changed = false | ||||
| 	} | ||||
| 
 | ||||
| 	return err | ||||
| } | ||||
| 
 | ||||
| func (d *Dog) recover() { | ||||
| 	if "" == d.Recover { | ||||
| 		return | ||||
| @ -214,7 +245,7 @@ func (d *Dog) recover() { | ||||
| 
 | ||||
| func (d *Dog) notify(msg string) { | ||||
| 	d.Logger <- fmt.Sprintf("Notifying the authorities of %s's status change", d.Name) | ||||
| 	d.lastNotified = time.Now() | ||||
| 	//d.lastNotified = time.Now() | ||||
| 
 | ||||
| 	for i := range d.Webhooks { | ||||
| 		name := d.Webhooks[i] | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user