mabel-dev · joocer · Jan 6, 2025 · Jan 5, 2025 · Jan 5, 2025 · Jan 5, 2025
diff --git a/Makefile b/Makefile
@@ -1,28 +1,21 @@
 lint:
-	python -m pip install --quiet --upgrade pycln isort ruff yamllint
+	python -m pip install --quiet --upgrade pycln isort ruff yamllint cython-lint
 #	python -m yamllint .
+	cython-lint opteryx/compiled/**/*.pyx
 	python -m ruff check --fix --exit-zero
 	python -m pycln .
 	python -m isort .
 	python -m ruff format opteryx
 
 update:
-	python -m pip install --upgrade pip
-	python -m pip install --upgrade -r requirements.txt
-	python -m pip install --upgrade -r tests/requirements.txt
+	python -m pip install --upgrade pip uv
+	python -m uv pip install --upgrade -r tests/requirements.txt
+	python -m uv pip install --upgrade -r requirements.txt
 
 t:
 	clear
 	python tests/sql_battery/test_shapes_and_errors_battery.py
 
-s:
-	clear
-	python tests/storage/test_sql_sqlite.py
-
-b:
-	clear
-	python scratch/brace.py
-
 test:
 	clear
 	export MANUAL_TEST=1
@@ -40,4 +33,7 @@ coverage:
 	python -m coverage report --include=opteryx/** -m
 
 compile:
+	clear
+	find . -name '*.so' -delete
+	python setup.py clean
 	python setup.py build_ext --inplace
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 956
+__build__ = 962
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/opteryx/compiled/__init__.py b/opteryx/compiled/__init__.py
@@ -1,4 +1 @@
-# from opteryx.compiled import cython_anyop_eq
 
-
-from opteryx.compiled.cross_join import build_rows_indices_and_column
diff --git a/opteryx/compiled/cross_join/__init__.py b/opteryx/compiled/cross_join/__init__.py
diff --git a/opteryx/compiled/functions/__init__.py b/opteryx/compiled/functions/__init__.py
@@ -1,6 +0,0 @@
-from .functions import generate_random_strings
-from .ip_address import ip_in_cidr
-from .vectors import possible_match
-from .vectors import possible_match_indices
-from .vectors import tokenize_and_remove_punctuation
-from .vectors import vectorize

diff --git a/opteryx/compiled/functions/functions.pyx b/opteryx/compiled/functions/functions.pyx
@@ -1,15 +1,21 @@
+# cython: language_level=3
+# cython: nonecheck=False
+# cython: cdivision=True
+# cython: initializedcheck=False
+# cython: infer_types=True
+# cython: wraparound=False
+# cython: boundscheck=False
+
 cimport numpy as cnp
 import numpy as np
 from libc.time cimport time
-cimport cython
 
 cdef bytes alphabet = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_/"
 
 # Seed for xorshift32 PRNG
 cdef unsigned int xorshift32_state = <unsigned int>time(NULL)
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
+
 def generate_random_strings(int row_count, int width) -> cnp.ndarray:
     """
     Generates a NumPy array of random fixed-width strings, repeated `row_count` times.
@@ -39,11 +45,12 @@ def generate_random_strings(int row_count, int width) -> cnp.ndarray:
 
     return result
 
+
 cdef inline unsigned int xorshift32():
     global xorshift32_state  # Declare as global to modify the module-level variable
     cdef unsigned int x = xorshift32_state
     x ^= (x << 13)
     x ^= (x >> 17)
     x ^= (x << 5)
     xorshift32_state = x
-    return x
+    return x
diff --git a/opteryx/compiled/functions/ip_address.pyx b/opteryx/compiled/functions/ip_address.pyx
@@ -1,19 +1,17 @@
 # cython: language_level=3
-# cython: boundscheck=False
+# cython: nonecheck=False
+# cython: cdivision=True
+# cython: initializedcheck=False
+# cython: infer_types=True
 # cython: wraparound=False
-# cython: nonecheck=False
-# cython: overflowcheck=False
+# cython: boundscheck=False
 
 from libc.stdint cimport uint32_t, int8_t
 from libc.stdlib cimport strtol
-from libc.string cimport strchr
 from libc.string cimport strlen
-from libc.string cimport memset
 import numpy as np
 cimport numpy as cnp
-from cpython cimport PyUnicode_AsUTF8String, PyBytes_GET_SIZE
-
-import cython
+from cpython cimport PyUnicode_AsUTF8String
 
 
 cdef inline uint32_t ip_to_int(const char* ip):
@@ -43,6 +41,7 @@ cdef inline uint32_t ip_to_int(const char* ip):
 
     return result
 
+
 def ip_in_cidr(cnp.ndarray ip_addresses, str cidr):
 
     # CIDR validation...
@@ -53,7 +52,6 @@ def ip_in_cidr(cnp.ndarray ip_addresses, str cidr):
     cdef int mask_size
     cdef str base_ip_str
     cdef list cidr_parts = cidr.split('/')
-    cdef bytes ip_byte_string
     cdef uint32_t arr_len = ip_addresses.shape[0]
 
     base_ip_str, mask_size = cidr_parts[0], int(cidr_parts[1])

diff --git a/opteryx/compiled/levenshtein/clevenshtein.pyx b/opteryx/compiled/functions/levenshtein.pyx
@@ -7,8 +7,8 @@
 # cython: boundscheck=False
 
 import numpy as np  # Required for array allocation
-from libc.stdint cimport int64_t, int32_t
-cimport cython
+from libc.stdint cimport int64_t
+
 
 cdef inline int64_t min3(int64_t x, int64_t y, int64_t z) nogil:
     """Utility function to find the minimum of three integers."""
@@ -52,9 +52,9 @@ cpdef int64_t levenshtein(str string1, str string2):
                 dp[i * len2 + j] = dp[(i - 1) * len2 + (j - 1)]
             else:
                 dp[i * len2 + j] = 1 + min3(
-                    dp[(i - 1) * len2 + j],      # Remove
-                    dp[i * len2 + (j - 1)],      # Insert
-                    dp[(i - 1) * len2 + (j - 1)] # Replace
+                    dp[(i - 1) * len2 + j],  # Remove
+                    dp[i * len2 + (j - 1)],  # Insert
+                    dp[(i - 1) * len2 + (j - 1)]  # Replace
                 )
 
     return dp[len1 * len2 + (len2 - 1)]
diff --git a/opteryx/compiled/functions/vectors.pyx b/opteryx/compiled/functions/vectors.pyx
@@ -1,12 +1,13 @@
 # cython: language_level=3
-# cython: boundscheck=False
+# cython: nonecheck=False
+# cython: cdivision=True
+# cython: initializedcheck=False
+# cython: infer_types=True
 # cython: wraparound=False
-# cython: nonecheck=False
-# cython: overflowcheck=False
+# cython: boundscheck=False
 
 import numpy as np
 cimport numpy as cnp
-cimport cython
 
 from libc.stdint cimport uint32_t, uint16_t, uint64_t
 from cpython cimport PyUnicode_AsUTF8String, PyBytes_GET_SIZE
@@ -67,7 +68,6 @@ cdef dict irregular_lemmas = {
     b'froze': b'freeze',
     b'got': b'get',
     b'gave': b'give',
-    b'went': b'go',
     b'grew': b'grow',
     b'had': b'have',
     b'heard': b'hear',
@@ -77,7 +77,6 @@ cdef dict irregular_lemmas = {
     b'kept': b'keep',
     b'knew': b'know',
     b'knelt': b'kneel',
-    b'knew': b'know',
     b'led': b'lead',
     b'leapt': b'leap',
     b'learnt': b'learn',
@@ -158,15 +157,13 @@ cdef inline uint16_t djb2_hash(char* byte_array, uint64_t length) nogil:
     return <uint16_t>(hash_value & 0xFFFF)
 
 
-
-
 def vectorize(list tokens):
     cdef cnp.ndarray[cnp.uint16_t, ndim=1] vector = np.zeros(VECTOR_SIZE, dtype=np.uint16)
     cdef uint32_t hash_1
     cdef uint32_t hash_2
     cdef bytes token_bytes
     cdef uint32_t token_size
-    
+
     for token_bytes in tokens:
         token_size = PyBytes_GET_SIZE(token_bytes)
         if token_size > 1:
@@ -176,7 +173,7 @@ def vectorize(list tokens):
             hash_1 = hash_1 & (VECTOR_SIZE - 1)
             if vector[hash_1] < 65535:
                 vector[hash_1] += 1
-            
+
             if vector[hash_2] < 65535:
                 vector[hash_2] += 1
 
@@ -188,7 +185,7 @@ def possible_match(list query_tokens, cnp.ndarray[cnp.uint16_t, ndim=1] vector):
     cdef uint16_t hash_2
     cdef bytes token_bytes
     cdef uint32_t token_size
-    
+
     for token_bytes in query_tokens:
         token_size = PyBytes_GET_SIZE(token_bytes)
         if token_size > 1:
@@ -219,7 +216,7 @@ def possible_match_indices(cnp.ndarray[cnp.uint16_t, ndim=1] indices, cnp.ndarra
     return True
 
 
-from libc.string cimport strlen, strcpy, strtok, strchr
+from libc.string cimport strlen, strcpy, strtok
 from libc.stdlib cimport malloc, free
 
 cdef char* strdup(const char* s) nogil:
@@ -279,7 +276,7 @@ cpdef list tokenize_and_remove_punctuation(str text, set stop_words):
     return tokens
 
 
-from libc.string cimport strlen, strncmp, strcpy, strcat
+from libc.string cimport strlen, strncmp, strcpy
 
 
 from libc.string cimport strlen, strncmp
@@ -317,4 +314,3 @@ cpdef inline bytes lemmatize(char* word, int word_len):
         return word[:word_len - 1]
 
     return word  # Return the original if no suffix matches
-
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1 @@
		# from opteryx.compiled import cython_anyop_eq


		from opteryx.compiled.cross_join import build_rows_indices_and_column