Skip to content

Commit

Permalink
Introduce output list formatting method to core.jobs.BaseJobManager (o…
Browse files Browse the repository at this point in the history
…ppia#3671)

* Adds some tests for the new _compress_output_list method

* Moves output list compression to its own function.

* _compress_output_list now passes all tests.

* Temporarily disable relative-import

* Allow protected access to method being tested.

* Re-enable relative-import

* Better naming for tests

* Integrates review suggestions

* Improve phrasing.

* Improve naming and documentation of the test-only override setting.

* Minor phrasing improvements.

* Adds testcase for exactly matching max_output_len.

* Typo: %s/intendend/intended

* Reordered tests since some tests rely on previous ones passing.
  • Loading branch information
brianrodri authored and seanlip committed Jul 24, 2017
1 parent c50b196 commit 3bd7c3d
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 21 deletions.
59 changes: 38 additions & 21 deletions core/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,35 +192,52 @@ def register_completion(cls, job_id, output_list):

model.status_code = STATUS_CODE_COMPLETED
model.time_finished_msec = utils.get_current_time_in_millisecs()
model.output = cls._compress_output_list(output_list)
model.put()

cls._post_completed_hook(job_id)

@classmethod
def _compress_output_list(cls, output_list, test_only_max_output_len=None):
"""Returns compressed list of strings within a max length of chars.
Ensures that the payload (i.e., [str(output) for output in output_list])
makes up at most max_output_chars of the final output data.
Args:
output_list: list(*). Collection of objects to be stringified.
test_only_max_output_len: int or None. Overrides the intended max
output len limit when not None.
# TODO(bhenning): Add tests for this.
output_str_list = ['%s' % output_value for output_value in output_list]
Returns:
list(str). The compressed stringified output values.
"""
_MAX_OUTPUT_LEN = 900000

# De-duplicate the lines of output since it's not very useful to repeat
# them.
counter = collections.Counter(list(output_str_list))
output_str_frequency_list = [
(output_str, counter[output_str]) for output_str in counter]
# Consolidate the lines of output since repeating them isn't useful.
counter = collections.Counter(str(output) for output in output_list)
output_str_list = [
line if freq == 1 else '%s (%d times)' % (line, freq)
for (line, freq) in output_str_frequency_list
output_str if count == 1 else '(%dx) %s' % (count, output_str)
for (output_str, count) in counter.iteritems()
]

cutoff_index = 0
total_output_size = 0
# Truncate outputs to fit within given max length.
if test_only_max_output_len is None:
remaining_len = _MAX_OUTPUT_LEN
else:
remaining_len = test_only_max_output_len

for idx, output_str in enumerate(output_str_list):
cutoff_index += 1
total_output_size += len(output_str)
if total_output_size >= _MAX_OUTPUT_LENGTH_CHARS:
max_element_length = (
total_output_size - _MAX_OUTPUT_LENGTH_CHARS)
output_str_list[idx] = output_str[:max_element_length]
output_str_list[idx] += ' <TRUNCATED>'
remaining_len -= len(output_str)
if remaining_len < 0:
# Truncate all remaining output to fit in the limit.
kept_str = output_str[:remaining_len]
output_str_list[idx:] = [
('%s <TRUNCATED>' % kept_str) if kept_str else '<TRUNCATED>'
]
break
model.output = output_str_list[:cutoff_index]
model.put()

cls._post_completed_hook(job_id)
return output_str_list

@classmethod
def register_failure(cls, job_id, error):
Expand Down
50 changes: 50 additions & 0 deletions core/jobs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,56 @@ def test_cancelling_one_unfinished_job(self):
'Canceled by admin_user_id', DummyJobManager.get_error(job1_id))
self.assertIsNone(DummyJobManager.get_error(job2_id))

def test_compress_output_list_with_single_char_outputs(self):
sample_output_list = [1, 2, 3, 4, 5]
expected = ['1', '2', '3', '<TRUNCATED>']
actual = jobs.BaseJobManager._compress_output_list( # pylint: disable=protected-access
sample_output_list, test_only_max_output_len=3)
self.assertEquals(expected, sorted(actual))

def test_compress_output_list_with_multi_char_outputs(self):
sample_output_list = ['abcd', 'efgh', 'ijkl']
expected = ['abcd', 'efgh', 'ij <TRUNCATED>']
actual = jobs.BaseJobManager._compress_output_list( # pylint: disable=protected-access
sample_output_list, test_only_max_output_len=10)
self.assertEquals(expected, sorted(actual))

def test_compress_output_list_with_zero_max_output_len(self):
sample_output_list = [1, 2, 3]
expected = ['<TRUNCATED>']
actual = jobs.BaseJobManager._compress_output_list( # pylint: disable=protected-access
sample_output_list, test_only_max_output_len=0)
self.assertEquals(expected, sorted(actual))

def test_compress_output_list_with_exact_max_output_len(self):
sample_output_list = ['abc']
expected = ['abc']
actual = jobs.BaseJobManager._compress_output_list( # pylint: disable=protected-access
sample_output_list, test_only_max_output_len=3)
self.assertEquals(expected, sorted(actual))

def test_compress_output_list_with_empty_outputs(self):
sample_output_list = []
expected = []
actual = jobs.BaseJobManager._compress_output_list(sample_output_list) # pylint: disable=protected-access
self.assertEquals(expected, sorted(actual))

def test_compress_output_list_with_duplicate_outputs(self):
sample_output_list = ['bar', 'foo'] * 3
expected = ['(3x) bar', '(3x) foo']
actual = jobs.BaseJobManager._compress_output_list( # pylint: disable=protected-access
sample_output_list,
# Ensure truncation doesn't happen in this test.
test_only_max_output_len=sum(len(s) for s in expected))
self.assertEquals(expected, sorted(actual))

def test_compress_output_list_with_truncated_duplicate_outputs(self):
sample_output_list = ['supercalifragilisticexpialidocious'] * 3
expected = ['(3x) super <TRUNCATED>']
actual = jobs.BaseJobManager._compress_output_list( # pylint: disable=protected-access
sample_output_list, test_only_max_output_len=10)
self.assertEquals(expected, sorted(actual))


SUM_MODEL_ID = 'all_data_id'

Expand Down

0 comments on commit 3bd7c3d

Please sign in to comment.