Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#2205 #2208

Merged
merged 11 commits into from
Jan 6, 2025
Merged

#2205 #2208

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
20 changes: 8 additions & 12 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,28 +1,21 @@
lint:
python -m pip install --quiet --upgrade pycln isort ruff yamllint
python -m pip install --quiet --upgrade pycln isort ruff yamllint cython-lint
# python -m yamllint .
cython-lint opteryx/compiled/**/*.pyx
python -m ruff check --fix --exit-zero
python -m pycln .
python -m isort .
python -m ruff format opteryx

update:
python -m pip install --upgrade pip
python -m pip install --upgrade -r requirements.txt
python -m pip install --upgrade -r tests/requirements.txt
python -m pip install --upgrade pip uv
python -m uv pip install --upgrade -r tests/requirements.txt
python -m uv pip install --upgrade -r requirements.txt

t:
clear
python tests/sql_battery/test_shapes_and_errors_battery.py

s:
clear
python tests/storage/test_sql_sqlite.py

b:
clear
python scratch/brace.py

test:
clear
export MANUAL_TEST=1
Expand All @@ -40,4 +33,7 @@ coverage:
python -m coverage report --include=opteryx/** -m

compile:
clear
find . -name '*.so' -delete
python setup.py clean
python setup.py build_ext --inplace
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 956
__build__ = 962

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
3 changes: 0 additions & 3 deletions opteryx/compiled/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
# from opteryx.compiled import cython_anyop_eq


from opteryx.compiled.cross_join import build_rows_indices_and_column
2 changes: 0 additions & 2 deletions opteryx/compiled/cross_join/__init__.py

This file was deleted.

6 changes: 0 additions & 6 deletions opteryx/compiled/functions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +0,0 @@
from .functions import generate_random_strings
from .ip_address import ip_in_cidr
from .vectors import possible_match
from .vectors import possible_match_indices
from .vectors import tokenize_and_remove_punctuation
from .vectors import vectorize
15 changes: 11 additions & 4 deletions opteryx/compiled/functions/functions.pyx
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
# cython: language_level=3
# cython: nonecheck=False
# cython: cdivision=True
# cython: initializedcheck=False
# cython: infer_types=True
# cython: wraparound=False
# cython: boundscheck=False

cimport numpy as cnp
import numpy as np
from libc.time cimport time
cimport cython

cdef bytes alphabet = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_/"

# Seed for xorshift32 PRNG
cdef unsigned int xorshift32_state = <unsigned int>time(NULL)

@cython.boundscheck(False)
@cython.wraparound(False)

def generate_random_strings(int row_count, int width) -> cnp.ndarray:
"""
Generates a NumPy array of random fixed-width strings, repeated `row_count` times.
Expand Down Expand Up @@ -39,11 +45,12 @@ def generate_random_strings(int row_count, int width) -> cnp.ndarray:

return result


cdef inline unsigned int xorshift32():
global xorshift32_state # Declare as global to modify the module-level variable
cdef unsigned int x = xorshift32_state
x ^= (x << 13)
x ^= (x >> 17)
x ^= (x << 5)
xorshift32_state = x
return x
return x
16 changes: 7 additions & 9 deletions opteryx/compiled/functions/ip_address.pyx
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
# cython: language_level=3
# cython: boundscheck=False
# cython: nonecheck=False
# cython: cdivision=True
# cython: initializedcheck=False
# cython: infer_types=True
# cython: wraparound=False
# cython: nonecheck=False
# cython: overflowcheck=False
# cython: boundscheck=False

from libc.stdint cimport uint32_t, int8_t
from libc.stdlib cimport strtol
from libc.string cimport strchr
from libc.string cimport strlen
from libc.string cimport memset
import numpy as np
cimport numpy as cnp
from cpython cimport PyUnicode_AsUTF8String, PyBytes_GET_SIZE

import cython
from cpython cimport PyUnicode_AsUTF8String


cdef inline uint32_t ip_to_int(const char* ip):
Expand Down Expand Up @@ -43,6 +41,7 @@ cdef inline uint32_t ip_to_int(const char* ip):

return result


def ip_in_cidr(cnp.ndarray ip_addresses, str cidr):

# CIDR validation...
Expand All @@ -53,7 +52,6 @@ def ip_in_cidr(cnp.ndarray ip_addresses, str cidr):
cdef int mask_size
cdef str base_ip_str
cdef list cidr_parts = cidr.split('/')
cdef bytes ip_byte_string
cdef uint32_t arr_len = ip_addresses.shape[0]

base_ip_str, mask_size = cidr_parts[0], int(cidr_parts[1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
# cython: boundscheck=False

import numpy as np # Required for array allocation
from libc.stdint cimport int64_t, int32_t
cimport cython
from libc.stdint cimport int64_t


cdef inline int64_t min3(int64_t x, int64_t y, int64_t z) nogil:
"""Utility function to find the minimum of three integers."""
Expand Down Expand Up @@ -52,9 +52,9 @@ cpdef int64_t levenshtein(str string1, str string2):
dp[i * len2 + j] = dp[(i - 1) * len2 + (j - 1)]
else:
dp[i * len2 + j] = 1 + min3(
dp[(i - 1) * len2 + j], # Remove
dp[i * len2 + (j - 1)], # Insert
dp[(i - 1) * len2 + (j - 1)] # Replace
dp[(i - 1) * len2 + j], # Remove
dp[i * len2 + (j - 1)], # Insert
dp[(i - 1) * len2 + (j - 1)] # Replace
)

return dp[len1 * len2 + (len2 - 1)]
24 changes: 10 additions & 14 deletions opteryx/compiled/functions/vectors.pyx
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# cython: language_level=3
# cython: boundscheck=False
# cython: nonecheck=False
# cython: cdivision=True
# cython: initializedcheck=False
# cython: infer_types=True
# cython: wraparound=False
# cython: nonecheck=False
# cython: overflowcheck=False
# cython: boundscheck=False

import numpy as np
cimport numpy as cnp
cimport cython

from libc.stdint cimport uint32_t, uint16_t, uint64_t
from cpython cimport PyUnicode_AsUTF8String, PyBytes_GET_SIZE
Expand Down Expand Up @@ -67,7 +68,6 @@ cdef dict irregular_lemmas = {
b'froze': b'freeze',
b'got': b'get',
b'gave': b'give',
b'went': b'go',
b'grew': b'grow',
b'had': b'have',
b'heard': b'hear',
Expand All @@ -77,7 +77,6 @@ cdef dict irregular_lemmas = {
b'kept': b'keep',
b'knew': b'know',
b'knelt': b'kneel',
b'knew': b'know',
b'led': b'lead',
b'leapt': b'leap',
b'learnt': b'learn',
Expand Down Expand Up @@ -158,15 +157,13 @@ cdef inline uint16_t djb2_hash(char* byte_array, uint64_t length) nogil:
return <uint16_t>(hash_value & 0xFFFF)




def vectorize(list tokens):
cdef cnp.ndarray[cnp.uint16_t, ndim=1] vector = np.zeros(VECTOR_SIZE, dtype=np.uint16)
cdef uint32_t hash_1
cdef uint32_t hash_2
cdef bytes token_bytes
cdef uint32_t token_size

for token_bytes in tokens:
token_size = PyBytes_GET_SIZE(token_bytes)
if token_size > 1:
Expand All @@ -176,7 +173,7 @@ def vectorize(list tokens):
hash_1 = hash_1 & (VECTOR_SIZE - 1)
if vector[hash_1] < 65535:
vector[hash_1] += 1

if vector[hash_2] < 65535:
vector[hash_2] += 1

Expand All @@ -188,7 +185,7 @@ def possible_match(list query_tokens, cnp.ndarray[cnp.uint16_t, ndim=1] vector):
cdef uint16_t hash_2
cdef bytes token_bytes
cdef uint32_t token_size

for token_bytes in query_tokens:
token_size = PyBytes_GET_SIZE(token_bytes)
if token_size > 1:
Expand Down Expand Up @@ -219,7 +216,7 @@ def possible_match_indices(cnp.ndarray[cnp.uint16_t, ndim=1] indices, cnp.ndarra
return True


from libc.string cimport strlen, strcpy, strtok, strchr
from libc.string cimport strlen, strcpy, strtok
from libc.stdlib cimport malloc, free

cdef char* strdup(const char* s) nogil:
Expand Down Expand Up @@ -279,7 +276,7 @@ cpdef list tokenize_and_remove_punctuation(str text, set stop_words):
return tokens


from libc.string cimport strlen, strncmp, strcpy, strcat
from libc.string cimport strlen, strncmp, strcpy


from libc.string cimport strlen, strncmp
Expand Down Expand Up @@ -317,4 +314,3 @@ cpdef inline bytes lemmatize(char* word, int word_len):
return word[:word_len - 1]

return word # Return the original if no suffix matches

Loading
Loading