Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

serialize/deserialize colors to/from bytes instead of strings #1049

Merged
merged 3 commits into from
Jan 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 6 additions & 17 deletions core/pe/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,13 @@
# which should be included with this package. The terms are also available at
# http://www.gnu.org/licenses/gpl-3.0.html

from core.pe._cache import string_to_colors # noqa
from core.pe._cache import bytes_to_colors # noqa


def colors_to_string(colors):
"""Transform the 3 sized tuples 'colors' into a hex string.
def colors_to_bytes(colors):
"""Transform the 3 sized tuples 'colors' into a bytes string.

[(0,100,255)] --> 0064ff
[(1,2,3),(4,5,6)] --> 010203040506
[(0,100,255)] --> b'\x00d\xff'
[(1,2,3),(4,5,6)] --> b'\x01\x02\x03\x04\x05\x06'
"""
return "".join("{:02x}{:02x}{:02x}".format(r, g, b) for r, g, b in colors)


# This function is an important bottleneck of dupeGuru PE. It has been converted to C.
# def string_to_colors(s):
# """Transform the string 's' in a list of 3 sized tuples.
# """
# result = []
# for i in xrange(0, len(s), 6):
# number = int(s[i:i+6], 16)
# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff))
# return result
return b"".join(map(bytes, colors))
4 changes: 2 additions & 2 deletions core/pe/cache.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ from typing import Union, Tuple, List

_block = Tuple[int, int, int]

def colors_to_string(colors: List[_block]) -> str: ... # noqa: E302
def string_to_colors(s: str) -> Union[List[_block], None]: ...
def colors_to_bytes(colors: List[_block]) -> bytes: ... # noqa: E302
def bytes_to_colors(s: bytes) -> Union[List[_block], None]: ...
60 changes: 39 additions & 21 deletions core/pe/cache_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,20 @@
import logging
import sqlite3 as sqlite

from core.pe.cache import string_to_colors, colors_to_string
from core.pe.cache import bytes_to_colors, colors_to_bytes


class SqliteCache:
"""A class to cache picture blocks in a sqlite backend."""

schema_version = 1
schema_version_description = "Changed from string to bytes for blocks."

create_table_query = "CREATE TABLE IF NOT EXISTS pictures(path TEXT, mtime_ns INTEGER, blocks BLOB)"
create_index_query = "CREATE INDEX IF NOT EXISTS idx_path on pictures (path)"
drop_table_query = "DROP TABLE IF EXISTS pictures"
drop_index_query = "DROP INDEX IF EXISTS idx_path"

def __init__(self, db=":memory:", readonly=False):
# readonly is not used in the sqlite version of the cache
self.dbname = db
Expand All @@ -40,7 +48,7 @@ def __getitem__(self, key):
sql = "select blocks from pictures where path = ?"
result = self.con.execute(sql, [key]).fetchone()
if result:
result = string_to_colors(result[0])
result = bytes_to_colors(result[0])
return result
else:
raise KeyError(key)
Expand All @@ -56,15 +64,15 @@ def __len__(self):
return result[0][0]

def __setitem__(self, path_str, blocks):
blocks = colors_to_string(blocks)
blocks = colors_to_bytes(blocks)
if op.exists(path_str):
mtime = int(os.stat(path_str).st_mtime)
else:
mtime = 0
if path_str in self:
sql = "update pictures set blocks = ?, mtime = ? where path = ?"
sql = "update pictures set blocks = ?, mtime_ns = ? where path = ?"
else:
sql = "insert into pictures(blocks,mtime,path) values(?,?,?)"
sql = "insert into pictures(blocks,mtime_ns,path) values(?,?,?)"
try:
self.con.execute(sql, [blocks, mtime, path_str])
except sqlite.OperationalError:
Expand All @@ -73,18 +81,9 @@ def __setitem__(self, path_str, blocks):
logging.warning("DatabaseError while setting value for key %r: %s", path_str, str(e))

def _create_con(self, second_try=False):
def create_tables():
logging.debug("Creating picture cache tables.")
self.con.execute("drop table if exists pictures")
self.con.execute("drop index if exists idx_path")
self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)")
self.con.execute("create index idx_path on pictures (path)")

self.con = sqlite.connect(self.dbname, isolation_level=None)
try:
self.con.execute("select path, mtime, blocks from pictures where 1=2")
except sqlite.OperationalError: # new db
create_tables()
self.con = sqlite.connect(self.dbname, isolation_level=None)
self._check_upgrade()
except sqlite.DatabaseError as e: # corrupted db
if second_try:
raise # Something really strange is happening
Expand All @@ -93,6 +92,25 @@ def create_tables():
os.remove(self.dbname)
self._create_con(second_try=True)

def _check_upgrade(self) -> None:
    """Bring the cache database in line with ``self.schema_version``.

    Creates the ``schema_version`` bookkeeping table when it is missing and
    reads the highest version recorded in it. When that version differs from
    ``self.schema_version`` (or no version is recorded at all), the
    ``pictures`` table is dropped and the current version and description are
    recorded. Finally the ``pictures`` table and its path index are created
    if they do not already exist.
    """
    with self.con as conn:
        has_schema = conn.execute(
            "SELECT NAME FROM sqlite_master WHERE type='table' AND name='schema_version'"
        ).fetchall()
        version = None
        if has_schema:
            row = conn.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()
            # The table may exist without any row (e.g. it was created but the
            # version was never recorded). fetchone() then returns None, which
            # must be treated as "no version" rather than indexed.
            if row is not None:
                version = row[0]
        else:
            conn.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)")
        if version != self.schema_version:
            # Version mismatch: discard the cached pictures and record the
            # version this code writes.
            conn.execute(self.drop_table_query)
            conn.execute(
                "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)",
                {"version": self.schema_version, "description": self.schema_version_description},
            )
        # Both statements use IF NOT EXISTS, so they are no-ops on an up-to-date db.
        conn.execute(self.create_table_query)
        conn.execute(self.create_index_query)

def clear(self):
self.close()
if self.dbname != ":memory:":
Expand Down Expand Up @@ -120,7 +138,7 @@ def get_id(self, path):
def get_multiple(self, rowids):
sql = "select rowid, blocks from pictures where rowid in (%s)" % ",".join(map(str, rowids))
cur = self.con.execute(sql)
return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur)
return ((rowid, bytes_to_colors(blocks)) for rowid, blocks in cur)

def purge_outdated(self):
"""Go through the cache and purge outdated records.
Expand All @@ -129,12 +147,12 @@ def purge_outdated(self):
the db.
"""
todelete = []
sql = "select rowid, path, mtime from pictures"
sql = "select rowid, path, mtime_ns from pictures"
cur = self.con.execute(sql)
for rowid, path_str, mtime in cur:
if mtime and op.exists(path_str):
for rowid, path_str, mtime_ns in cur:
if mtime_ns and op.exists(path_str):
picture_mtime = os.stat(path_str).st_mtime
if int(picture_mtime) <= mtime:
if int(picture_mtime) <= mtime_ns:
# not outdated
continue
todelete.append(rowid)
Expand Down
2 changes: 1 addition & 1 deletion core/pe/matchblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
# to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage
# is that instead of reading blocks from disk number_of_files**2 times, we read it
# number_of_files*number_of_chunks times.
# Determining the right chunk size is tricky, bceause if it's too big, too many blocks will be in
# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in
# memory at the same time and we might end up with memory trashing, which is awfully slow. So,
# because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't
# starved by Disk IOs.
Expand Down
122 changes: 48 additions & 74 deletions core/pe/modules/cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,94 +2,68 @@
* Created On: 2010-01-30
* Copyright 2014 Hardcoded Software (http://www.hardcoded.net)
*
* This software is licensed under the "BSD" License as described in the "LICENSE" file,
* which should be included with this package. The terms are also available at
* http://www.hardcoded.net/licenses/bsd_license
* This software is licensed under the "BSD" License as described in the
* "LICENSE" file, which should be included with this package. The terms are
* also available at http://www.hardcoded.net/licenses/bsd_license
*/

#include "common.h"

/* I know that there strtol out there, but it requires a pointer to
* a char, which would in turn require me to buffer my chars around,
* making the whole process slower.
*/
static long
xchar_to_long(char c)
{
if ((c >= 48) && (c <= 57)) { /* 0-9 */
return c - 48;
}
else if ((c >= 65) && (c <= 70)) { /* A-F */
return c - 55;
}
else if ((c >= 97) && (c <= 102)) { /* a-f */
return c - 87;
}
return 0;
}
static PyObject *cache_bytes_to_colors(PyObject *self, PyObject *args) {
char *y;
Py_ssize_t char_count, i, color_count;
PyObject *result;
unsigned long r, g, b;
Py_ssize_t ci;
PyObject *color_tuple;

static PyObject*
cache_string_to_colors(PyObject *self, PyObject *args)
{
char *s;
Py_ssize_t char_count, color_count, i;
PyObject *result;
if (!PyArg_ParseTuple(args, "y#", &y, &char_count)) {
return NULL;
}

if (!PyArg_ParseTuple(args, "s#", &s, &char_count)) {
return NULL;
}
color_count = char_count / 3;
result = PyList_New(color_count);
if (result == NULL) {
return NULL;
}

color_count = (char_count / 6);
result = PyList_New(color_count);
if (result == NULL) {
return NULL;
}

for (i=0; i<color_count; i++) {
long r, g, b;
Py_ssize_t ci;
PyObject *color_tuple;

ci = i * 6;
r = (xchar_to_long(s[ci]) << 4) + xchar_to_long(s[ci+1]);
g = (xchar_to_long(s[ci+2]) << 4) + xchar_to_long(s[ci+3]);
b = (xchar_to_long(s[ci+4]) << 4) + xchar_to_long(s[ci+5]);
for (i = 0; i < color_count; i++) {
ci = i * 3;
r = (unsigned char)y[ci];
g = (unsigned char)y[ci + 1];
b = (unsigned char)y[ci + 2];

color_tuple = inttuple(3, r, g, b);
if (color_tuple == NULL) {
Py_DECREF(result);
return NULL;
}
PyList_SET_ITEM(result, i, color_tuple);
color_tuple = inttuple(3, r, g, b);
if (color_tuple == NULL) {
Py_DECREF(result);
return NULL;
}
PyList_SET_ITEM(result, i, color_tuple);
}

return result;
return result;
}

static PyMethodDef CacheMethods[] = {
{"string_to_colors", cache_string_to_colors, METH_VARARGS,
"Transform the string 's' in a list of 3 sized tuples."},
{NULL, NULL, 0, NULL} /* Sentinel */
{"bytes_to_colors", cache_bytes_to_colors, METH_VARARGS,
"Transform the bytes 's' into a list of 3 sized tuples."},
{NULL, NULL, 0, NULL} /* Sentinel */
};

static struct PyModuleDef CacheDef = {
PyModuleDef_HEAD_INIT,
"_cache",
NULL,
-1,
CacheMethods,
NULL,
NULL,
NULL,
NULL
};
static struct PyModuleDef CacheDef = {PyModuleDef_HEAD_INIT,
"_cache",
NULL,
-1,
CacheMethods,
NULL,
NULL,
NULL,
NULL};

PyObject *
PyInit__cache(void)
{
PyObject *m = PyModule_Create(&CacheDef);
if (m == NULL) {
return NULL;
}
return m;
PyObject *PyInit__cache(void) {
PyObject *m = PyModule_Create(&CacheDef);
if (m == NULL) {
return NULL;
}
return m;
}
2 changes: 1 addition & 1 deletion core/pe/modules/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ PyObject* inttuple(int n, ...)
result = PyTuple_New(n);

for (i=0; i<n; i++) {
pnumber = PyLong_FromLong(va_arg(numbers, long));
pnumber = PyLong_FromUnsignedLong(va_arg(numbers, long));
if (pnumber == NULL) {
Py_DECREF(result);
return NULL;
Expand Down
25 changes: 13 additions & 12 deletions core/tests/cache_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,40 +10,41 @@
from hscommon.testutil import eq_

try:
from core.pe.cache import colors_to_string, string_to_colors
from core.pe.cache import colors_to_bytes, bytes_to_colors
from core.pe.cache_sqlite import SqliteCache
except ImportError:
skip("Can't import the cache module, probably hasn't been compiled.")


class TestCaseColorsToString:
def test_no_color(self):
eq_("", colors_to_string([]))
eq_(b"", colors_to_bytes([]))

def test_single_color(self):
eq_("000000", colors_to_string([(0, 0, 0)]))
eq_("010101", colors_to_string([(1, 1, 1)]))
eq_("0a141e", colors_to_string([(10, 20, 30)]))
eq_(b"\x00\x00\x00", colors_to_bytes([(0, 0, 0)]))
eq_(b"\x01\x01\x01", colors_to_bytes([(1, 1, 1)]))
eq_(b"\x0a\x14\x1e", colors_to_bytes([(10, 20, 30)]))

def test_two_colors(self):
eq_("000102030405", colors_to_string([(0, 1, 2), (3, 4, 5)]))
eq_(b"\x00\x01\x02\x03\x04\x05", colors_to_bytes([(0, 1, 2), (3, 4, 5)]))


class TestCaseStringToColors:
def test_empty(self):
eq_([], string_to_colors(""))
eq_([], bytes_to_colors(b""))

def test_single_color(self):
eq_([(0, 0, 0)], string_to_colors("000000"))
eq_([(2, 3, 4)], string_to_colors("020304"))
eq_([(10, 20, 30)], string_to_colors("0a141e"))
eq_([(0, 0, 0)], bytes_to_colors(b"\x00\x00\x00"))
eq_([(2, 3, 4)], bytes_to_colors(b"\x02\x03\x04"))
eq_([(10, 20, 30)], bytes_to_colors(b"\x0a\x14\x1e"))

def test_two_colors(self):
eq_([(10, 20, 30), (40, 50, 60)], string_to_colors("0a141e28323c"))
eq_([(10, 20, 30), (40, 50, 60)], bytes_to_colors(b"\x0a\x14\x1e\x28\x32\x3c"))

def test_incomplete_color(self):
# don't return anything if it's not a complete color
eq_([], string_to_colors("102"))
eq_([], bytes_to_colors(b"\x01"))
eq_([(1, 2, 3)], bytes_to_colors(b"\x01\x02\x03\x04"))


class BaseTestCaseCache:
Expand Down