diff --git a/core/pe/cache.py b/core/pe/cache.py index 31fcb0bb..738037f7 100644 --- a/core/pe/cache.py +++ b/core/pe/cache.py @@ -4,24 +4,13 @@ # which should be included with this package. The terms are also available at # http://www.gnu.org/licenses/gpl-3.0.html -from core.pe._cache import string_to_colors # noqa +from core.pe._cache import bytes_to_colors # noqa -def colors_to_string(colors): - """Transform the 3 sized tuples 'colors' into a hex string. +def colors_to_bytes(colors): + """Transform the 3 sized tuples 'colors' into a bytes string. - [(0,100,255)] --> 0064ff - [(1,2,3),(4,5,6)] --> 010203040506 + [(0,100,255)] --> b'\x00d\xff' + [(1,2,3),(4,5,6)] --> b'\x01\x02\x03\x04\x05\x06' """ - return "".join("{:02x}{:02x}{:02x}".format(r, g, b) for r, g, b in colors) - - -# This function is an important bottleneck of dupeGuru PE. It has been converted to C. -# def string_to_colors(s): -# """Transform the string 's' in a list of 3 sized tuples. -# """ -# result = [] -# for i in xrange(0, len(s), 6): -# number = int(s[i:i+6], 16) -# result.append((number >> 16, (number >> 8) & 0xff, number & 0xff)) -# return result + return b"".join(map(bytes, colors)) diff --git a/core/pe/cache.pyi b/core/pe/cache.pyi index fbf1e8c8..dd59b510 100644 --- a/core/pe/cache.pyi +++ b/core/pe/cache.pyi @@ -2,5 +2,5 @@ from typing import Union, Tuple, List _block = Tuple[int, int, int] -def colors_to_string(colors: List[_block]) -> str: ... # noqa: E302 -def string_to_colors(s: str) -> Union[List[_block], None]: ... +def colors_to_bytes(colors: List[_block]) -> bytes: ... # noqa: E302 +def bytes_to_colors(s: bytes) -> Union[List[_block], None]: ... diff --git a/core/pe/cache_sqlite.py b/core/pe/cache_sqlite.py index ebaa8e66..4cb3c588 100644 --- a/core/pe/cache_sqlite.py +++ b/core/pe/cache_sqlite.py @@ -9,12 +9,20 @@ import logging import sqlite3 as sqlite -from core.pe.cache import string_to_colors, colors_to_string +from core.pe.cache import bytes_to_colors, colors_to_bytes class SqliteCache: """A class to cache picture blocks in a sqlite backend.""" + schema_version = 1 + schema_version_description = "Changed from string to bytes for blocks." + + create_table_query = "CREATE TABLE IF NOT EXISTS pictures(path TEXT, mtime_ns INTEGER, blocks BLOB)" + create_index_query = "CREATE INDEX IF NOT EXISTS idx_path on pictures (path)" + drop_table_query = "DROP TABLE IF EXISTS pictures" + drop_index_query = "DROP INDEX IF EXISTS idx_path" + def __init__(self, db=":memory:", readonly=False): # readonly is not used in the sqlite version of the cache self.dbname = db @@ -40,7 +48,7 @@ def __getitem__(self, key): sql = "select blocks from pictures where path = ?" result = self.con.execute(sql, [key]).fetchone() if result: - result = string_to_colors(result[0]) + result = bytes_to_colors(result[0]) return result else: raise KeyError(key) @@ -56,15 +64,15 @@ def __len__(self): return result[0][0] def __setitem__(self, path_str, blocks): - blocks = colors_to_string(blocks) + blocks = colors_to_bytes(blocks) if op.exists(path_str): mtime = int(os.stat(path_str).st_mtime) else: mtime = 0 if path_str in self: - sql = "update pictures set blocks = ?, mtime = ? where path = ?" + sql = "update pictures set blocks = ?, mtime_ns = ? where path = ?" else: - sql = "insert into pictures(blocks,mtime,path) values(?,?,?)" + sql = "insert into pictures(blocks,mtime_ns,path) values(?,?,?)" try: self.con.execute(sql, [blocks, mtime, path_str]) except sqlite.OperationalError: @@ -73,18 +81,9 @@ def __setitem__(self, path_str, blocks): logging.warning("DatabaseError while setting value for key %r: %s", path_str, str(e)) def _create_con(self, second_try=False): - def create_tables(): - logging.debug("Creating picture cache tables.") - self.con.execute("drop table if exists pictures") - self.con.execute("drop index if exists idx_path") - self.con.execute("create table pictures(path TEXT, mtime INTEGER, blocks TEXT)") - self.con.execute("create index idx_path on pictures (path)") - - self.con = sqlite.connect(self.dbname, isolation_level=None) try: - self.con.execute("select path, mtime, blocks from pictures where 1=2") - except sqlite.OperationalError: # new db - create_tables() + self.con = sqlite.connect(self.dbname, isolation_level=None) + self._check_upgrade() except sqlite.DatabaseError as e: # corrupted db if second_try: raise # Something really strange is happening @@ -93,6 +92,25 @@ def create_tables(): os.remove(self.dbname) self._create_con(second_try=True) + def _check_upgrade(self) -> None: + with self.con as conn: + has_schema = conn.execute( + "SELECT NAME FROM sqlite_master WHERE type='table' AND name='schema_version'" + ).fetchall() + version = None + if has_schema: + version = conn.execute("SELECT version FROM schema_version ORDER BY version DESC").fetchone()[0] + else: + conn.execute("CREATE TABLE schema_version (version int PRIMARY KEY, description TEXT)") + if version != self.schema_version: + conn.execute(self.drop_table_query) + conn.execute( + "INSERT OR REPLACE INTO schema_version VALUES (:version, :description)", + {"version": self.schema_version, "description": self.schema_version_description}, + ) + conn.execute(self.create_table_query) + conn.execute(self.create_index_query) + def clear(self): self.close() if self.dbname != ":memory:": @@ -120,7 +138,7 @@ def get_id(self, path): def get_multiple(self, rowids): sql = "select rowid, blocks from pictures where rowid in (%s)" % ",".join(map(str, rowids)) cur = self.con.execute(sql) - return ((rowid, string_to_colors(blocks)) for rowid, blocks in cur) + return ((rowid, bytes_to_colors(blocks)) for rowid, blocks in cur) def purge_outdated(self): """Go through the cache and purge outdated records. @@ -129,12 +147,12 @@ def purge_outdated(self): the db. """ todelete = [] - sql = "select rowid, path, mtime from pictures" + sql = "select rowid, path, mtime_ns from pictures" cur = self.con.execute(sql) - for rowid, path_str, mtime in cur: - if mtime and op.exists(path_str): + for rowid, path_str, mtime_ns in cur: + if mtime_ns and op.exists(path_str): picture_mtime = os.stat(path_str).st_mtime - if int(picture_mtime) <= mtime: + if int(picture_mtime) <= mtime_ns: # not outdated continue todelete.append(rowid) diff --git a/core/pe/matchblock.py b/core/pe/matchblock.py index bc203175..9af739bd 100644 --- a/core/pe/matchblock.py +++ b/core/pe/matchblock.py @@ -28,7 +28,7 @@ # to files in other chunks. So chunkifying doesn't save us any actual comparison, but the advantage # is that instead of reading blocks from disk number_of_files**2 times, we read it # number_of_files*number_of_chunks times. -# Determining the right chunk size is tricky, bceause if it's too big, too many blocks will be in +# Determining the right chunk size is tricky, because if it's too big, too many blocks will be in # memory at the same time and we might end up with memory trashing, which is awfully slow. So, # because our *real* bottleneck is CPU, the chunk size must simply be enough so that the CPU isn't # starved by Disk IOs. diff --git a/core/pe/modules/cache.c b/core/pe/modules/cache.c index 598337cf..feccfeb5 100644 --- a/core/pe/modules/cache.c +++ b/core/pe/modules/cache.c @@ -2,94 +2,68 @@ * Created On: 2010-01-30 * Copyright 2014 Hardcoded Software (http://www.hardcoded.net) * - * This software is licensed under the "BSD" License as described in the "LICENSE" file, - * which should be included with this package. The terms are also available at - * http://www.hardcoded.net/licenses/bsd_license + * This software is licensed under the "BSD" License as described in the + * "LICENSE" file, which should be included with this package. The terms are + * also available at http://www.hardcoded.net/licenses/bsd_license */ #include "common.h" -/* I know that there strtol out there, but it requires a pointer to - * a char, which would in turn require me to buffer my chars around, - * making the whole process slower. - */ -static long -xchar_to_long(char c) -{ - if ((c >= 48) && (c <= 57)) { /* 0-9 */ - return c - 48; - } - else if ((c >= 65) && (c <= 70)) { /* A-F */ - return c - 55; - } - else if ((c >= 97) && (c <= 102)) { /* a-f */ - return c - 87; - } - return 0; -} +static PyObject *cache_bytes_to_colors(PyObject *self, PyObject *args) { + char *y; + Py_ssize_t char_count, i, color_count; + PyObject *result; + unsigned long r, g, b; + Py_ssize_t ci; + PyObject *color_tuple; -static PyObject* -cache_string_to_colors(PyObject *self, PyObject *args) -{ - char *s; - Py_ssize_t char_count, color_count, i; - PyObject *result; + if (!PyArg_ParseTuple(args, "y#", &y, &char_count)) { + return NULL; + } - if (!PyArg_ParseTuple(args, "s#", &s, &char_count)) { - return NULL; - } + color_count = char_count / 3; + result = PyList_New(color_count); + if (result == NULL) { + return NULL; + } - color_count = (char_count / 6); - result = PyList_New(color_count); - if (result == NULL) { - return NULL; - } - - for (i=0; i