Skip to content

Commit

Permalink
Add ability to control how unicode decoding errors are handled
Browse files Browse the repository at this point in the history
Adds an errors argument to Reader objects to be used in conjunction with the
encoding argument. Valid values for errors are 'strict' (the default),
'replace', 'ignore', or 'backslashreplace'. These four values are defined
by Python: https://docs.python.org/3/howto/unicode.html#the-string-type

Note this is a backwards incompatible change. Prior to this change
hiredis-py would silence UnicodeDecodeErrors and return the original bytes
object. After this change, the default behavior is to raise the
UnicodeDecodeError when a decoding error is encountered. This matches
Python's default behavior. The user can alter the default behavior by
specifying a different value to errors just like normal Python.
  • Loading branch information
andymccurdy authored and ifduyue committed Dec 30, 2018
1 parent 283b7b9 commit c27b463
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 51 deletions.
17 changes: 12 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,23 @@ To do so, specify the encoding you want to use for decoding replies when
initializing it:

```python
>>> reader = hiredis.Reader(encoding="utf-8")
>>> reader = hiredis.Reader(encoding="utf-8", errors="strict")
>>> reader.feed("$3\r\n\xe2\x98\x83\r\n")
>>> reader.gets()
u''
```

When bulk data in a reply could not be properly decoded using the specified
encoding, it will be returned as a plain string. When the encoding cannot be
found, a `LookupError` will be raised after calling `gets` for the first reply
with bulk data (identical to what Python's `unicode` method would do).
Decoding of bulk data will be attempted using the specified encoding and
error handler. If the error handler is `'strict'` (the default), a
`UnicodeDecodeError` is raised when data cannot be dedcoded. This is identical
to Python's default behavior. Other valid values to `errors` include
`'replace'`, `'ignore'`, and `'backslashreplace'`. More information on the
behavior of these error handlers can be found
[here](https://docs.python.org/3/howto/unicode.html#the-string-type).


When the specified encoding cannot be found, a `LookupError` will be raised
when calling `gets` for the first reply with bulk data.

#### Error handling

Expand Down
68 changes: 28 additions & 40 deletions src/reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,17 @@ static PyObject *createDecodedString(hiredis_ReaderObject *self, const char *str
if (self->encoding == NULL || !self->shouldDecode) {
obj = PyBytes_FromStringAndSize(str, len);
} else {
obj = PyUnicode_Decode(str, len, self->encoding, NULL);
obj = PyUnicode_Decode(str, len, self->encoding, self->errors);
if (obj == NULL) {
if (PyErr_ExceptionMatches(PyExc_ValueError)) {
/* Ignore encoding and simply return plain string. */
obj = PyBytes_FromStringAndSize(str, len);
} else {
assert(PyErr_ExceptionMatches(PyExc_LookupError));

/* Store error when this is the first. */
if (self->error.ptype == NULL)
PyErr_Fetch(&(self->error.ptype), &(self->error.pvalue),
&(self->error.ptraceback));

/* Return Py_None as placeholder to let the error bubble up and
* be used when a full reply in Reader#gets(). */
obj = Py_None;
Py_INCREF(obj);
}
/* Store error when this is the first. */
if (self->error.ptype == NULL)
PyErr_Fetch(&(self->error.ptype), &(self->error.pvalue),
&(self->error.ptraceback));

/* Return Py_None as placeholder to let the error bubble up and
* be used when a full reply in Reader#gets(). */
obj = Py_None;
Py_INCREF(obj);
PyErr_Clear();
}
}
Expand Down Expand Up @@ -173,9 +165,9 @@ redisReplyObjectFunctions hiredis_ObjectFunctions = {
};

static void Reader_dealloc(hiredis_ReaderObject *self) {
// we don't need to free self->encoding as the buffer is managed by Python
// https://docs.python.org/3/c-api/arg.html#strings-and-buffers
redisReplyReaderFree(self->reader);
if (self->encoding)
free(self->encoding);
Py_XDECREF(self->protocolErrorClass);
Py_XDECREF(self->replyErrorClass);

Expand All @@ -198,13 +190,14 @@ static int _Reader_set_exception(PyObject **target, PyObject *value) {
}

static int Reader_init(hiredis_ReaderObject *self, PyObject *args, PyObject *kwds) {
static char *kwlist[] = { "protocolError", "replyError", "encoding", NULL };
static char *kwlist[] = { "protocolError", "replyError", "encoding", "errors", NULL };
PyObject *protocolErrorClass = NULL;
PyObject *replyErrorClass = NULL;
PyObject *encodingObj = NULL;
char *encoding = NULL;
char *errors = NULL;

if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOO", kwlist,
&protocolErrorClass, &replyErrorClass, &encodingObj))
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOss", kwlist,
&protocolErrorClass, &replyErrorClass, &encoding, &errors))
return -1;

if (protocolErrorClass)
Expand All @@ -215,25 +208,19 @@ static int Reader_init(hiredis_ReaderObject *self, PyObject *args, PyObject *kwd
if (!_Reader_set_exception(&self->replyErrorClass, replyErrorClass))
return -1;

if (encodingObj) {
PyObject *encbytes;
char *encstr;
int enclen;

if (PyUnicode_Check(encodingObj))
encbytes = PyUnicode_AsASCIIString(encodingObj);
else
encbytes = PyObject_Bytes(encodingObj);
self->encoding = encoding;
if(errors) {
if (strcmp(errors, "strict") != 0 &&
strcmp(errors, "replace") != 0 &&
strcmp(errors, "ignore") != 0 &&
strcmp(errors, "backslashreplace") != 0) {

if (encbytes == NULL)
PyErr_SetString(PyExc_LookupError,
"if specified, errors must be one of "
"{'strict', 'replace', 'ignore', 'backslashreplace'}");
return -1;

enclen = PyBytes_Size(encbytes);
encstr = PyBytes_AsString(encbytes);
self->encoding = (char*)malloc(enclen+1);
memcpy(self->encoding, encstr, enclen);
self->encoding[enclen] = '\0';
Py_DECREF(encbytes);
}
self->errors = errors;
}

return 0;
Expand All @@ -248,6 +235,7 @@ static PyObject *Reader_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
self->reader->privdata = self;

self->encoding = NULL;
self->errors = "strict"; // default to "strict" to mimic Python
self->shouldDecode = 1;
self->protocolErrorClass = HIREDIS_STATE->HiErr_ProtocolError;
self->replyErrorClass = HIREDIS_STATE->HiErr_ReplyError;
Expand Down
1 change: 1 addition & 0 deletions src/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ typedef struct {
PyObject_HEAD
redisReader *reader;
char *encoding;
char *errors;
int shouldDecode;
PyObject *protocolErrorClass;
PyObject *replyErrorClass;
Expand Down
22 changes: 16 additions & 6 deletions test/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,17 +141,27 @@ def test_bulk_string_with_encoding(self):
self.reader.feed(b"$3\r\n" + snowman + b"\r\n")
self.assertEquals(snowman.decode("utf-8"), self.reply())

def test_bulk_string_with_other_encoding(self):
snowman = b"\xe2\x98\x83"
self.reader = hiredis.Reader(encoding="utf-32")
self.reader.feed(b"$3\r\n" + snowman + b"\r\n")
self.assertEquals(snowman, self.reply())

def test_bulk_string_with_invalid_encoding(self):
self.reader = hiredis.Reader(encoding="unknown")
self.reader.feed(b"$5\r\nhello\r\n")
self.assertRaises(LookupError, self.reply)

def test_decode_errors_defaults_to_strict(self):
self.reader = hiredis.Reader(encoding="utf-8")
self.reader.feed(b"+\x80\r\n")
self.assertRaises(UnicodeDecodeError, self.reader.gets)

def test_decode_error_with_ignore_errors(self):
self.reader = hiredis.Reader(encoding="utf-8", errors="ignore")
self.reader.feed(b"+\x80value\r\n")
self.assertEquals("value", self.reader.gets())

def test_invalid_encoding_error_handler(self):
self.assertRaises(LookupError, hiredis.Reader, errors='unknown')

def test_reader_with_invalid_error_handler(self):
self.assertRaises(LookupError, hiredis.Reader, encoding="utf-8", errors='foo')

def test_should_decode_false_flag_prevents_decoding(self):
snowman = b"\xe2\x98\x83"
self.reader = hiredis.Reader(encoding="utf-8")
Expand Down

0 comments on commit c27b463

Please sign in to comment.