Skip to content

Commit

Permalink
Change cron reporter to report on job status every 24 hours.
Browse files Browse the repository at this point in the history
  • Loading branch information
seanlip committed Sep 5, 2014
1 parent 363b7c5 commit b6975cb
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 15 deletions.
17 changes: 10 additions & 7 deletions core/controllers/cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,18 @@
import feconf


class JobFailureMailerHandler(base.BaseHandler):
class JobStatusMailerHandler(base.BaseHandler):
"""Handler for mailing admin a MapReduce job status report (failure alert or all-clear)."""

def get(self):
"""Handles GET requests."""
NINETY_MINUTES_IN_MSECS = 90 * 60 * 1000
TWENTY_FIVE_HOURS_IN_MSECS = 25 * 60 * 60 * 1000

failed_jobs = jobs.get_stuck_jobs(NINETY_MINUTES_IN_MSECS)
failed_jobs = jobs.get_stuck_jobs(TWENTY_FIVE_HOURS_IN_MSECS)
if failed_jobs:
email_subject = 'MapReduce failure alert'
email_message = (
'Some jobs have failed in the past 90 minutes. '
'Some jobs have failed in the past 25 hours. '
'More information:')

for job in failed_jobs:
Expand All @@ -51,7 +52,9 @@ def get(self):
job.retries, job.slice_retries, job.update_time,
job.last_work_item
)
else:
email_subject = 'MapReduce status report'
email_message = 'All MapReduce jobs are running fine.'

email_services.send_mail_to_admin(
feconf.ADMIN_EMAIL_ADDRESS, 'MapReduce failure alert',
email_message)
email_services.send_mail_to_admin(
feconf.ADMIN_EMAIL_ADDRESS, email_subject, email_message)
8 changes: 5 additions & 3 deletions core/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1095,12 +1095,14 @@ def get_stuck_jobs(recency_msecs):
datetime.datetime.utcnow() -
datetime.timedelta(0, 0, 0, recency_msecs))
shard_state_model_class = mapreduce_model.ShardState
recent_job_models = shard_state_model_class.all().filter(
'update_time >', threshold_time)

# TODO(sll): Clean up old jobs so that this query does not have to iterate
# over so many elements in a full table scan.
recent_job_models = shard_state_model_class.all()

stuck_jobs = []
for job_model in recent_job_models:
if job_model.retries > 0:
if job_model.update_time > threshold_time and job_model.retries > 0:
stuck_jobs.append(job_model)

return stuck_jobs
Expand Down
6 changes: 3 additions & 3 deletions cron.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cron:
- description: email admin about job failures
url: /mail/admin/job_failure
schedule: every 1 hours
- description: daily email about mapreduce job statuses
url: /mail/admin/job_status
schedule: every day 16:00
4 changes: 2 additions & 2 deletions main_cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
# Register the URLs with the classes responsible for handling them.
urls = [
main.get_redirect_route(
r'/cron/mail/admin/job_failure', cron.JobFailureMailerHandler,
'job_failure_mailer'),
r'/cron/mail/admin/job_status', cron.JobStatusMailerHandler,
'job_failure_mailer'),
]


Expand Down

0 comments on commit b6975cb

Please sign in to comment.