From f0583ecb1713469544b63b0874f05e48a950308d Mon Sep 17 00:00:00 2001 From: KeyserSosa Date: Mon, 2 Feb 2009 14:43:27 -0800 Subject: [PATCH] updates to service monitor to allow for each app's tracking of database load. Also 'redirecting...' message on most commonly used ajax'd forms when input is accepted. --- r2/r2/controllers/api.py | 24 ++-- r2/r2/public/static/css/reddit.css | 21 ++- r2/r2/public/static/js/jquery.reddit.js | 12 ++ r2/r2/templates/appservicemonitor.html | 28 +++- r2/supervise_watcher.py | 176 +++++++++++++++++------- 5 files changed, 192 insertions(+), 69 deletions(-) diff --git a/r2/r2/controllers/api.py b/r2/r2/controllers/api.py index 84a0db721..26b1ca221 100644 --- a/r2/r2/controllers/api.py +++ b/r2/r2/controllers/api.py @@ -96,7 +96,7 @@ class ApiController(RedditController): @validatedForm() def ajax_login_redirect(self, form, jquery, dest): - jquery.redirect("/login" + query_string(dict(dest=dest))) + form.redirect("/login" + query_string(dict(dest=dest))) @validate(link = VUrl(['url']), count = VLimit('limit')) @@ -192,7 +192,7 @@ class ApiController(RedditController): if form.has_errors("url", errors.NO_URL, errors.BAD_URL): pass elif form.has_errors("url", errors.ALREADY_SUB): - jquery.redirect(url[0].already_submitted_link) + form.redirect(url[0].already_submitted_link) # check for title, otherwise look it up and return it elif form.has_errors("title", errors.NO_TITLE): # try to fetch the title @@ -256,16 +256,16 @@ class ApiController(RedditController): c.cname = False path = l.make_permalink_slow() c.cname = cname - jquery.redirect(path) + form.redirect(path) - def _login(self, jquery, user, dest='', rem = None): + def _login(self, form, user, dest='', rem = None): """ AJAX login handler, used by both login and register to set the user cookie and send back a redirect. """ self.login(user, rem = rem) dest = dest or request.referer or '/' - jquery.redirect(dest) + form.redirect(dest) @validatedForm(user = VLogin(['user', 'passwd']), @@ -276,7 +276,7 @@ class ApiController(RedditController): if reason and reason[0] == 'redirect': dest = reason[1] if not form.has_errors("passwd", errors.WRONG_PASSWORD): - self._login(jquery, user, dest, rem) + self._login(form, user, dest, rem) @validatedForm(VCaptcha(), @@ -321,7 +321,7 @@ class ApiController(RedditController): for sr, sub in reason[1].iteritems(): self._subscribe(sr, sub) - self._login(jquery, user, dest, rem) + self._login(form, user, dest, rem) @noresponse(VUser(), @@ -482,7 +482,7 @@ class ApiController(RedditController): """ if areyousure1 == areyousure2 == areyousure3 == 'yes': c.user.delete() - jquery.redirect('/?deleted=true') + form.redirect('/?deleted=true') else: form.set_html('.status', _("see? you don't really want to leave")) @@ -981,7 +981,7 @@ class ApiController(RedditController): form.parent().set_html('.status', _("saved")) if redir: - jquery.redirect(redir) + form.redirect(redir) @noresponse(VModhash(), VSrCanBan('id'), @@ -1141,7 +1141,7 @@ class ApiController(RedditController): password = VPassword(['passwd', 'passwd2'])) def POST_resetpassword(self, form, jquery, user, password): if errors.BAD_USERNAME in c.errors: - return jquery.redirect('/password') + return form.redirect('/password') elif (not form.has_errors('passwd', errors.BAD_PASSWORD) and not form.has_errors('passwd2', errors.BAD_PASSWORD_MATCH) and user): @@ -1296,7 +1296,7 @@ class ApiController(RedditController): l._commit() l.update_url_cache(old_url) - jquery.redirect('/promote/edit_promo/%s' % to36(l._id)) + form.redirect('/promote/edit_promo/%s' % to36(l._id)) else: l = Link._submit(title, url, c.user, sr, ip, False) @@ -1312,7 +1312,7 @@ class ApiController(RedditController): promote_until = promote_until, disable_comments = disable_comments) - jquery.redirect('/promote/edit_promo/%s' % to36(l._id)) + form.redirect('/promote/edit_promo/%s' % to36(l._id)) def GET_link_thumb(self, *a, **kw): """ diff --git a/r2/r2/public/static/css/reddit.css b/r2/r2/public/static/css/reddit.css index 6e96cfb54..90977074c 100644 --- a/r2/r2/public/static/css/reddit.css +++ b/r2/r2/public/static/css/reddit.css @@ -985,8 +985,16 @@ textarea.gray { color: gray; } padding-right: 5px; } .wired img {vertical-align: middle;} -.server-status { width: 300px; } -.server-status table { font-size: xx-small; margin-left: 5px; } +.server-status { width: 300px; } +.server-status table { + font-size: xx-small; + margin-left: 5px; + border-top: #BCBCBC solid 1px; + border-left: #BCBCBC solid 1px; + border-bottom: #E0E0E0 solid 1px; + border-right: #E0E0E0 solid 1px; + margin-bottom: 5px; +} .server-status td { padding-right: 2px; padding-left: 2px; } .server-status .bar { height: 5px; background-color: blue; } .server-status .load0 { background-color: #FFFFFF; } @@ -995,12 +1003,21 @@ textarea.gray { color: gray; } .server-status .load3 { background-color: #FFEA71; } .server-status .load4 { background-color: #FF9191; } .server-status .load5 { background-color: #FF0000; color: #FFFFFF } +.server-status tr.down > * { + background-color: #C0C0C0; + text-decoration: line-through; +} .server-status th { font-weight: bold; padding-right: 2px; } .server-status tr.title-region { cursor: pointer; } .server-status tr.title-region:hover > td, .server-status tr.title-region:hover > th { text-decoration: underline; } +.server-status tr.title-region.empty { cursor: default; opacity: 0.7; } +.server-status tr.title-region.empty:hover > td, +.server-status tr.title-region.empty:hover > th { text-decoration: none; } + + .server-status .pegged { background-color: red; font-weight: bold; diff --git a/r2/r2/public/static/js/jquery.reddit.js b/r2/r2/public/static/js/jquery.reddit.js index 35f1431aa..2f8875307 100644 --- a/r2/r2/public/static/js/jquery.reddit.js +++ b/r2/r2/public/static/js/jquery.reddit.js @@ -17,11 +17,23 @@ $.log = function(message) { }; $.debug = $.log; +$.fn.debug = function() { + $.debug($(this)); + return $(this); +} $.redirect = function(dest) { window.location = dest; }; +$.fn.redirect = function(dest) { + /* for forms which are "posting" by ajax leading to a redirect */ + $(this).filter("form").find(".status").show().html("redirecting..."); + $.redirect(dest); + /* this should never happen, but for the sake of internal consistency */ + return $(this) +} + $.refresh = function() { window.location.reload(true); }; diff --git a/r2/r2/templates/appservicemonitor.html b/r2/r2/templates/appservicemonitor.html index ac2bddc26..0bba7e287 100644 --- a/r2/r2/templates/appservicemonitor.html +++ b/r2/r2/templates/appservicemonitor.html @@ -61,6 +61,24 @@ ${host.database.connections(300)} + <% + qcount = host.database.query_count \ + if hasattr(host.database, "query_count") else None + %> + %if qcount: + + + + + query count: + + + ${qcount()} +  /  + ${qcount(300)} + + + %endif by ip: @@ -115,11 +133,11 @@ %for host in thing.hostlogs: <% host_id = host.host.replace('.', '-') - s = host.services load = host.load() load_level = min(max(int(load+0.5), 0),5) + empty_cls = '' if len(host.services) else 'empty' %> - + ${host.host} load: ${load} @@ -137,9 +155,11 @@ mem_wid = int(mem_col/25*min(25, int(service.mem()))) cpu_60_wid = int(cpu_col/100*min(100,int(service.cpu(60)))) cpu_300_wid = int(cpu_col/100*min(100,int(service.cpu(300)))) + is_down = 'down' if service.pid < 0 else '' %> - + %if g.reddit_host == host.host and g.reddit_pid == service.pid: » @@ -165,7 +185,7 @@ else: age = "%d min" % age %> - ${age} + ${age if service.pid > 0 else 'down'} %endfor diff --git a/r2/supervise_watcher.py b/r2/supervise_watcher.py index 450ef3788..0b8b8b187 100644 --- a/r2/supervise_watcher.py +++ b/r2/supervise_watcher.py @@ -20,28 +20,87 @@ # CondeNet, Inc. All Rights Reserved. ################################################################################ #!/usr/bin/env python -from pylons import g import os, re, sys, socket, time, smtplib import subprocess from datetime import datetime, timedelta from r2.lib.wrapped import Wrapped -host = g.reddit_host -default_services = ['newreddit'] -def is_db_machine(host): +class AppServiceMonitor(Wrapped): """ - Given a host name, checks the list of known DB machines to - determine if the host is one of them. - """ - for db in g.databases: - ip = list(g.to_iter(getattr(g, db + "_db")))[1] - name = socket.gethostbyaddr(ip)[0] - if (name == host or ("." in host and name.endswith("." + host)) or - name.startswith(host + ".")): - return True + Master controller class for service monitoring. Can be + initialized at the same time as pylons.g provided g is passed in + as the global_config argument. This class has three purposes: - return False + * Fetches Hostlogger instances from the cache for generating + reports (by calling render() as it is a subclass of wrapped). + + * keeping track of which machines are DB machines, allowing db + load to be checked and improving load balancing. + + * monitoring the local host's load and storing it in the cache. + + """ + + def __init__(self, hosts = None, global_conf = None): + """ + hosts is a list of machine hostnames to be tracked (will + default to global_conf.monitored_servers if not provided). + Note the ability to pass in the global_conf (aka pylons.g) + to allow for initializing before the app has finished loading. + """ + if not global_conf: + from pylons import g + global_conf = g + self.global_conf = global_conf + self._hosts = hosts or global_conf.monitored_servers + + db_info = {} + for db in global_conf.databases: + dbase, ip = list(global_conf.to_iter( + getattr(global_conf, db + "_db")))[:2] + name = socket.gethostbyaddr(ip)[0] + + for host in global_conf.monitored_servers: + if (name == host or + ("." in host and name.endswith("." + host)) or + name.startswith(host + ".")): + db_info[db] = (dbase, ip, host) + + self._db_info = db_info + self.hostlogs = [] + Wrapped.__init__(self) + + def database_load(self, db_name): + if self._db_info.has_key(db_name): + return self.server_load(self._db_info[db_name][-1]) + + @staticmethod + def server_load(mach_name): + h = HostLogger.from_cache(host, self.global_conf) + return h.load.most_recent() + + def __iter__(self): + return iter(self.hostlogs) + + def render(self, *a, **kw): + self.hostlogs = [HostLogger.from_cache(host, self.global_conf) + for host in self._hosts] + self.hostlogs = filter(None, self.hostlogs) + return Wrapped.render(self, *a, **kw) + + def monitor(self, *a, **kw): + host = self.global_conf.reddit_host + h = (HostLogger.from_cache(host, self.global_conf) or + HostLogger(host, self)) + return h.monitor(self, *a, **kw) + + def is_db_machine(self, host): + """ + Given a host name, checks the list of known DB machines to + determine if the host is one of them. + """ + return any(host == name for d2,ip,name in self._db_info.values()) class DataLogger(object): @@ -100,9 +159,11 @@ class Database(object): self.connections = DataLogger() self.ip_conn = {} self.db_conn = {} + self.query_count = DataLogger() - def track(self, conn = 0, ip_conn = {}, db_conn = {}, vacuums = {}): + def track(self, conn = 0, ip_conn = {}, db_conn = {}, vacuums = {}, + query_count = None): #log the number of connections self.connections.add(conn) @@ -119,16 +180,23 @@ class Database(object): # log vacuuming self.vacuuming = [k for k, v in vacuums.iteritems() if v] - + + # has a query count + if query_count is not None: + self.query_count.add(query_count) class HostLogger(object): cache_key = "machine_datalog_data_" - def __init__(self, host): + @classmethod + def cache(self, global_conf): + return global_conf.rendercache + + def __init__(self, host, master): self.host = host self.load = DataLogger() self.services = {} - self.database = Database() if is_db_machine(host) else None + self.database = Database() if master.is_db_machine(host) else None def service_pids(self): return self.services.keys() @@ -147,24 +215,25 @@ class HostLogger(object): else: self.services[pid].age = int(age / 60) - def set_cache(self): + def set_cache(self, global_conf): key = self.cache_key + str(self.host) - g.rendercache.set(key, self) + self.cache(global_conf).set(key, self) @classmethod - def from_cache(cls, host): + def from_cache(cls, host, global_conf): key = cls.cache_key + str(host) - return g.rendercache.get(key) + return cls.cache(global_conf).get(key) def clean_dead(self, age = 10): time = datetime.now() for pid, s in list(self.services.iteritems()): t = s.last_update() - if not t or t < time - timedelta(0, age): + if not t or t < time - timedelta(0, age) or pid < 0: del self.services[pid] - def monitor(self, srvname = None, loop = True, loop_time = 2, + def monitor(self, service_monitor, + srvname = None, loop = True, loop_time = 2, srv_params = {}, top_params = {}, db_params = {}): while True: # (re)populate the service listing @@ -187,10 +256,8 @@ class HostLogger(object): self.load.add(float(foo.split(' ')[1].strip(','))) handle.close() - - self.clean_dead() - self.set_cache() + self.set_cache(service_monitor.global_conf) if loop: time.sleep(loop_time) @@ -205,15 +272,6 @@ class HostLogger(object): yield s[pid] -class AppServiceMonitor(Wrapped): - def __init__(self, hosts = None): - hosts = hosts or g.monitored_servers - self.hostlogs = [HostLogger.from_cache(host) for host in hosts] - self.hostlogs = filter(lambda x: x, self.hostlogs) - - def __iter__(self): - return iter(self.hostlogs) - def Alert(restart_list = ['MEM','CPU'], alert_recipients = ['nerds@reddit.com'], @@ -223,7 +281,7 @@ def Alert(restart_list = ['MEM','CPU'], p = re.compile("newreddit(\d+)") cache_key = 'already_alerted_' - + from pylons import g for host in AppServiceMonitor(g.monitored_servers): for service in host: # cpu values @@ -292,7 +350,7 @@ def run_top(proc_ids = [], name = '', exe = "/usr/bin/top"): if not os.path.exists(exe): raise ValueError, "bad executable specified for top" - cmd = [exe, '-b', '-n1'] + ["-p%d" % x for x in proc_ids] + cmd = [exe, '-b', '-n1'] + ["-p%d" % x for x in proc_ids if x > 0] handle = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE) @@ -316,19 +374,23 @@ def run_top(proc_ids = [], name = '', exe = "/usr/bin/top"): def supervise_list(exe = "/usr/local/bin/svstat", path = '/service/'): handle = os.popen("%s %s*" % (exe, path)) + defunct = 0 for line in handle: + line = line.split(' ') + name = line[0] try: - name, status, blah, pid, time, label = line.split(' ')[:6] + status, blah, pid, time = line[1:5] name = name[len(path):].strip(':') if status == 'up': pid = int(pid.strip(')')) time = int(time) else: - pid = -1 - time = 0 - yield (name, status, pid, time) + raise ValueError, "down process" except ValueError: - pass + defunct += 1 + pid = -defunct + time = 0 + yield (name, "down", pid, time) handle.close() def check_database(proc = "postgres", check_vacuum = True, user='ri'): @@ -354,21 +416,33 @@ def check_database(proc = "postgres", check_vacuum = True, user='ri'): vacuums = {} if check_vacuum: - vac = ("(echo '\t'; echo 'select * from active;') " + + vac = ("(echo '\\t'; echo 'select * from active;') " + "| psql -U %(user)s %(db)s | grep -i '| vacuum'") for db in by_db: handle = os.popen(vac % dict(user=user, db=db)) vacuums[db] = bool(handle.read()) handle.close() - - return dict(conn = total, - ip_conn = by_ip, - db_conn = by_db, - vacuums = vacuums) + + res = dict(conn = total, ip_conn = by_ip, db_conn = by_db, + vacuums = vacuums) + + if 'query_queue' in by_db: + cmd = ("(echo '\t'; echo 'select count(*) from reddit_query_queue;') " + "| psql -U %(user)s query_queue ") + handle = os.popen(cmd % dict(user = user)) + for line in handle: + try: + res['query_count'] = int(line.strip('\n ')) + break + except ValueError: + continue + handle.close() + return res def Run(*a, **kw): - HostLogger(g.reddit_host).monitor(*a, **kw) + from pylons import g + AppServiceMonitor(global_conf = g).monitor(*a, **kw) def Test(num, load = 1., pid = 0): services = Services() @@ -383,4 +457,4 @@ def Test(num, load = 1., pid = 0): services.set_cache() if __name__ == '__main__': - Run(sys.argv[1:] if sys.argv[1:] else default_services) + Run(sys.argv[1:] if sys.argv[1:] else ['newreddit'])