Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

serialize/deserialize colors to/from bytes instead of strings #1049

Merged
merged 3 commits into from
Jan 27, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
serialize/deserialize colors to/from bytes instead of strings
it's a tiny bit faster and saves a bit of memory
  • Loading branch information
Dobatymo committed Sep 27, 2022
commit f1153c85c0684497101002aa8adbe8ee6a52d100
23 changes: 6 additions & 17 deletions core/pe/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,13 @@
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from core.pe._cache import string_to_colors # noqa
from core.pe._cache import bytes_to_colors # noqa


def colors_to_string(colors):
"""Transform the 3 sized tuples 'colors' into a hex string.
def colors_to_bytes(colors):
"""Transform the 3 sized tuples 'colors' into a bytes string.

[(0,100,255)] --> 0064ff
[(1,2,3),(4,5,6)] --> 010203040506
[(0,100,255)] --> b'\x00d\xff'
[(1,2,3),(4,5,6)] --> b'\x01\x02\x03\x04\x05\x06'
"""
return "".join("{:02x}{:02x}{:02x}".format(r, g, b) for r, g, b in colors)


# This function is an important bottleneck of dupeGuru PE. It has been converted to C.
# def string_to_colors(s):
# """Transform the string 's' in a list of 3 sized tuples.
# """
# result = []
# for i in xrange(0, len(s), 6):
# number = int(s[i:i+6], 16)
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
# return result
return b"".join(map(bytes, colors))
4 changes: 2 additions & 2 deletions core/pe/cache.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ from typing import Union, Tuple, List

_block = Tuple[int, int, int]

def colors_to_string(colors: List[_block]) -> str: ... # noqa: E302
def string_to_colors(s: str) -> Union[List[_block], None]: ...
def colors_to_bytes(colors: List[_block]) -> bytes: ... # noqa: E302
def bytes_to_colors(s: bytes) -> Union[List[_block], None]: ...
8 changes: 4 additions & 4 deletions core/pe/cache_shelve.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import tempfile
from collections import namedtuple

from core.pe.cache import string_to_colors, colors_to_string
from core.pe.cache import bytes_to_colors, colors_to_bytes


def wrap_path(path):
Expand Down Expand Up @@ -57,7 +57,7 @@ def __getitem__(self, key):
skey = self.shelve[wrap_id(key)]
else:
skey = wrap_path(key)
return string_to_colors(self.shelve[skey].blocks)
return bytes_to_colors(self.shelve[skey].blocks)

def __iter__(self):
return (unwrap_path(k) for k in self.shelve if k.startswith("path:"))
Expand All @@ -66,7 +66,7 @@ def __len__(self):
return sum(1 for k in self.shelve if k.startswith("path:"))

def __setitem__(self, path_str, blocks):
blocks = colors_to_string(blocks)
blocks = colors_to_bytes(blocks)
if op.exists(path_str):
mtime = int(os.stat(path_str).st_mtime)
else:
Expand Down Expand Up @@ -114,7 +114,7 @@ def get_multiple(self, rowids):
skey = self.shelve[wrap_id(rowid)]
except KeyError:
continue
yield (rowid, string_to_colors(self.shelve[skey].blocks))
yield (rowid, bytes_to_colors(self.shelve[skey].blocks))

def purge_outdated(self):
"""Go through the cache and purge outdated records.
Expand Down
10 changes: 5 additions & 5 deletions core/pe/cache_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import logging
import sqlite3 as sqlite

from core.pe.cache import string_to_colors, colors_to_string
from core.pe.cache import bytes_to_colors, colors_to_bytes


class SqliteCache:
Expand Down Expand Up @@ -40,7 +40,7 @@ def __getitem__(self, key):
sql = "select blocks from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchone()
if result:
result = string_to_colors(result[0])
result = bytes_to_colors(result[0])
return result
else:
raise KeyError(key)
Expand All @@ -56,7 +56,7 @@ def __len__(self):
return result[0][0]

def __setitem__(self, path_str, blocks):
blocks = colors_to_string(blocks)
blocks = colors_to_bytes(blocks)
if op.exists(path_str):
mtime = int(os.stat(path_str).st_mtime)
else:
Expand All @@ -77,7 +77,7 @@ def create_tables():
logging.debug("Creating picture cache tables.")
self.con.execute("drop table if exists pictures")
self.con.execute("drop index if exists idx_path")
self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks BLOB)")
self.con.execute("create index idx_path on pictures (path)")

self.con = sqlite.connect(self.dbname, isolation_level=None)
Expand Down Expand Up @@ -120,7 +120,7 @@ def get_id(self, path):
def get_multiple(self, rowids):
sql = "select rowid, blocks from pictures where rowid in (%s)" % ",".join(map(str, rowids))
cur = self.con.execute(sql)
return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
return ((rowid, bytes_to_colors(blocks)) for rowid, blocks in cur)

def purge_outdated(self):
"""Go through the cache and purge outdated records.
Expand Down
2 changes: 1 addition & 1 deletion core/pe/matchblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
# to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage
# is that instead of reading blocks from disk number_of_files**2 times, we read it
# number_of_files*number_of_chunks times.
# Determining the right chunk size is tricky, bceause if it's too big, too many blocks will be in
# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in
# memory at the same time and we might end up with memory thrashing, which is awfully slow. So,
# because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't
# starved by Disk IOs.
Expand Down
49 changes: 14 additions & 35 deletions core/pe/modules/cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,51 +9,31 @@

#include "common.h"

/* I know that there strtol out there, but it requires a pointer to
* a char, which would in turn require me to buffer my chars around,
* making the whole process slower.
*/
static long
xchar_to_long(char c)
{
if ((c >= 48) && (c <= 57)) { /* 0-9 */
return c - 48;
}
else if ((c >= 65) && (c <= 70)) { /* A-F */
return c - 55;
}
else if ((c >= 97) && (c <= 102)) { /* a-f */
return c - 87;
}
return 0;
}

static PyObject*
cache_string_to_colors(PyObject *self, PyObject *args)
cache_bytes_to_colors(PyObject *self, PyObject *args)
{
char *s;
Py_ssize_t char_count, color_count, i;
char *y;
Py_ssize_t char_count, i, color_count;
PyObject *result;

if (!PyArg_ParseTuple(args, "s#", &s, &char_count)) {
unsigned long r, g, b;
Py_ssize_t ci;
PyObject *color_tuple;

if (!PyArg_ParseTuple(args, "y#", &y, &char_count)) {
return NULL;
}

color_count = (char_count / 6);
color_count = char_count / 3;
result = PyList_New(color_count);
if (result == NULL) {
return NULL;
}

for (i=0; i<color_count; i++) {
long r, g, b;
Py_ssize_t ci;
PyObject *color_tuple;

ci = i * 6;
r = (xchar_to_long(s[ci]) << 4) + xchar_to_long(s[ci+1]);
g = (xchar_to_long(s[ci+2]) << 4) + xchar_to_long(s[ci+3]);
b = (xchar_to_long(s[ci+4]) << 4) + xchar_to_long(s[ci+5]);
ci = i * 3;
r = (unsigned char) y[ci];
g = (unsigned char) y[ci+1];
b = (unsigned char) y[ci+2];

color_tuple = inttuple(3, r, g, b);
if (color_tuple == NULL) {
Expand All @@ -67,8 +47,7 @@ cache_string_to_colors(PyObject *self, PyObject *args)
}

static PyMethodDef CacheMethods[] = {
{"string_to_colors", cache_string_to_colors, METH_VARARGS,
"Transform the string 's' in a list of 3 sized tuples."},
{"bytes_to_colors", cache_bytes_to_colors, METH_VARARGS, "Transform the bytes 's' into a list of 3 sized tuples."},
{NULL, NULL, 0, NULL} /* Sentinel */
};

Expand Down
2 changes: 1 addition & 1 deletion core/pe/modules/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ PyObject* inttuple(int n, ...)
result = PyTuple_New(n);

for (i=0; i<n; i++) {
pnumber = PyLong_FromLong(va_arg(numbers, long));
pnumber = PyLong_FromUnsignedLong(va_arg(numbers, long));
if (pnumber == NULL) {
Py_DECREF(result);
return NULL;
Expand Down
2 changes: 1 addition & 1 deletion core/pe/modules/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ int min(int a, int b);
#endif

/* Create a tuple out of an array of integers. */
PyObject* inttuple(int n, ...);
PyObject* inttuple(int n, ...);
25 changes: 13 additions & 12 deletions core/tests/cache_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from hscommon.testutil import eq_

try:
from core.pe.cache import colors_to_string, string_to_colors
from core.pe.cache import colors_to_bytes, bytes_to_colors
from core.pe.cache_sqlite import SqliteCache
from core.pe.cache_shelve import ShelveCache
except ImportError:
Expand All @@ -19,32 +19,33 @@

class TestCaseColorsToString:
def test_no_color(self):
eq_("", colors_to_string([]))
eq_(b"", colors_to_bytes([]))

def test_single_color(self):
eq_("000000", colors_to_string([(0, 0, 0)]))
eq_("010101", colors_to_string([(1, 1, 1)]))
eq_("0a141e", colors_to_string([(10, 20, 30)]))
eq_(b"\x00\x00\x00", colors_to_bytes([(0, 0, 0)]))
eq_(b"\x01\x01\x01", colors_to_bytes([(1, 1, 1)]))
eq_(b"\x0a\x14\x1e", colors_to_bytes([(10, 20, 30)]))

def test_two_colors(self):
eq_("000102030405", colors_to_string([(0, 1, 2), (3, 4, 5)]))
eq_(b"\x00\x01\x02\x03\x04\x05", colors_to_bytes([(0, 1, 2), (3, 4, 5)]))


class TestCaseStringToColors:
def test_empty(self):
eq_([], string_to_colors(""))
eq_([], bytes_to_colors(b""))

def test_single_color(self):
eq_([(0, 0, 0)], string_to_colors("000000"))
eq_([(2, 3, 4)], string_to_colors("020304"))
eq_([(10, 20, 30)], string_to_colors("0a141e"))
eq_([(0, 0, 0)], bytes_to_colors(b"\x00\x00\x00"))
eq_([(2, 3, 4)], bytes_to_colors(b"\x02\x03\x04"))
eq_([(10, 20, 30)], bytes_to_colors(b"\x0a\x14\x1e"))

def test_two_colors(self):
eq_([(10, 20, 30), (40, 50, 60)], string_to_colors("0a141e28323c"))
eq_([(10, 20, 30), (40, 50, 60)], bytes_to_colors(b"\x0a\x14\x1e\x28\x32\x3c"))

def test_incomplete_color(self):
# don't return anything if it's not a complete color
eq_([], string_to_colors("102"))
eq_([], bytes_to_colors(b"\x01"))
eq_([(1, 2, 3)], bytes_to_colors(b"\x01\x02\x03\x04"))


class BaseTestCaseCache:
Expand Down