Add ability to control how unicode decoding errors are handled

Adds an errors argument to Reader objects to be used in conjunction with the encoding argument. Valid values for errors are 'strict' (the default), 'replace', 'ignore', or 'backslashreplace'. These four values are defined by Python: https://docs.python.org/3/howto/unicode.html#the-string-type Note this is a backwards incompatible change. Prior to this change hiredis-py would silence UnicodeDecodeErrors and return the original bytes object. After this change, the default behavior is to raise the UnicodeDecodeError when a decoding error is encountered. This matches Python's default behavior. The user can alter the default behavior by specifying a different value to errors just like normal Python.
siavash119 · Dec 30, 2018 · c27b463 · c27b463
1 parent 283b7b9
commit c27b463
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 51 deletions.
diff --git a/README.md b/README.md
@@ -68,16 +68,23 @@ To do so, specify the encoding you want to use for decoding replies when
 initializing it:
 
 ```python
->>> reader = hiredis.Reader(encoding="utf-8")
+>>> reader = hiredis.Reader(encoding="utf-8", errors="strict")
 >>> reader.feed("$3\r\n\xe2\x98\x83\r\n")
 >>> reader.gets()
 u'☃'
 ```
 
-When bulk data in a reply could not be properly decoded using the specified
-encoding, it will be returned as a plain string. When the encoding cannot be
-found, a `LookupError` will be raised after calling `gets` for the first reply
-with bulk data (identical to what Python's `unicode` method would do).
+Decoding of bulk data will be attempted using the specified encoding and
+error handler. If the error handler is `'strict'` (the default), a
+`UnicodeDecodeError` is raised when data cannot be dedcoded. This is identical
+to Python's default behavior. Other valid values to `errors` include
+`'replace'`, `'ignore'`, and `'backslashreplace'`. More information on the
+behavior of these error handlers can be found
+[here](https://docs.python.org/3/howto/unicode.html#the-string-type).
+
+
+When the specified encoding cannot be found, a `LookupError` will be raised
+when calling `gets` for the first reply with bulk data.
 
 #### Error handling
 

diff --git a/src/reader.c b/src/reader.c
@@ -79,25 +79,17 @@ static PyObject *createDecodedString(hiredis_ReaderObject *self, const char *str
     if (self->encoding == NULL || !self->shouldDecode) {
         obj = PyBytes_FromStringAndSize(str, len);
     } else {
-        obj = PyUnicode_Decode(str, len, self->encoding, NULL);
+        obj = PyUnicode_Decode(str, len, self->encoding, self->errors);
         if (obj == NULL) {
-            if (PyErr_ExceptionMatches(PyExc_ValueError)) {
-                /* Ignore encoding and simply return plain string. */
-                obj = PyBytes_FromStringAndSize(str, len);
-            } else {
-                assert(PyErr_ExceptionMatches(PyExc_LookupError));
-
-                /* Store error when this is the first. */
-                if (self->error.ptype == NULL)
-                    PyErr_Fetch(&(self->error.ptype), &(self->error.pvalue),
-                            &(self->error.ptraceback));
-
-                /* Return Py_None as placeholder to let the error bubble up and
-                 * be used when a full reply in Reader#gets(). */
-                obj = Py_None;
-                Py_INCREF(obj);
-            }
+            /* Store error when this is the first. */
+            if (self->error.ptype == NULL)
+                PyErr_Fetch(&(self->error.ptype), &(self->error.pvalue),
+                        &(self->error.ptraceback));
 
+            /* Return Py_None as placeholder to let the error bubble up and
+             * be used when a full reply in Reader#gets(). */
+            obj = Py_None;
+            Py_INCREF(obj);
             PyErr_Clear();
         }
     }
@@ -173,9 +165,9 @@ redisReplyObjectFunctions hiredis_ObjectFunctions = {
 };
 
 static void Reader_dealloc(hiredis_ReaderObject *self) {
+    // we don't need to free self->encoding as the buffer is managed by Python
+    // https://docs.python.org/3/c-api/arg.html#strings-and-buffers
     redisReplyReaderFree(self->reader);
-    if (self->encoding)
-        free(self->encoding);
     Py_XDECREF(self->protocolErrorClass);
     Py_XDECREF(self->replyErrorClass);
 
@@ -198,13 +190,14 @@ static int _Reader_set_exception(PyObject **target, PyObject *value) {
 }
 
 static int Reader_init(hiredis_ReaderObject *self, PyObject *args, PyObject *kwds) {
-    static char *kwlist[] = { "protocolError", "replyError", "encoding", NULL };
+    static char *kwlist[] = { "protocolError", "replyError", "encoding", "errors", NULL };
     PyObject *protocolErrorClass = NULL;
     PyObject *replyErrorClass = NULL;
-    PyObject *encodingObj = NULL;
+    char *encoding = NULL;
+    char *errors = NULL;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOO", kwlist,
-        &protocolErrorClass, &replyErrorClass, &encodingObj))
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOss", kwlist,
+        &protocolErrorClass, &replyErrorClass, &encoding, &errors))
             return -1;
 
     if (protocolErrorClass)
@@ -215,25 +208,19 @@ static int Reader_init(hiredis_ReaderObject *self, PyObject *args, PyObject *kwd
         if (!_Reader_set_exception(&self->replyErrorClass, replyErrorClass))
             return -1;
 
-    if (encodingObj) {
-        PyObject *encbytes;
-        char *encstr;
-        int enclen;
-
-        if (PyUnicode_Check(encodingObj))
-            encbytes = PyUnicode_AsASCIIString(encodingObj);
-        else
-            encbytes = PyObject_Bytes(encodingObj);
+    self->encoding = encoding;
+    if(errors) {
+        if (strcmp(errors, "strict") != 0 &&
+            strcmp(errors, "replace") != 0 &&
+            strcmp(errors, "ignore") != 0 &&
+            strcmp(errors, "backslashreplace") != 0) {
 
-        if (encbytes == NULL)
+            PyErr_SetString(PyExc_LookupError,
+                            "if specified, errors must be one of "
+                            "{'strict', 'replace', 'ignore', 'backslashreplace'}");
             return -1;
-
-        enclen = PyBytes_Size(encbytes);
-        encstr = PyBytes_AsString(encbytes);
-        self->encoding = (char*)malloc(enclen+1);
-        memcpy(self->encoding, encstr, enclen);
-        self->encoding[enclen] = '\0';
-        Py_DECREF(encbytes);
+        }
+        self->errors = errors;
     }
 
     return 0;
@@ -248,6 +235,7 @@ static PyObject *Reader_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
         self->reader->privdata = self;
 
         self->encoding = NULL;
+        self->errors = "strict";  // default to "strict" to mimic Python
         self->shouldDecode = 1;
         self->protocolErrorClass = HIREDIS_STATE->HiErr_ProtocolError;
         self->replyErrorClass = HIREDIS_STATE->HiErr_ReplyError;

diff --git a/src/reader.h b/src/reader.h
@@ -7,6 +7,7 @@ typedef struct {
     PyObject_HEAD
     redisReader *reader;
     char *encoding;
+    char *errors;
     int shouldDecode;
     PyObject *protocolErrorClass;
     PyObject *replyErrorClass;

diff --git a/test/reader.py b/test/reader.py
@@ -141,17 +141,27 @@ def test_bulk_string_with_encoding(self):
     self.reader.feed(b"$3\r\n" + snowman + b"\r\n")
     self.assertEquals(snowman.decode("utf-8"), self.reply())
 
-  def test_bulk_string_with_other_encoding(self):
-    snowman = b"\xe2\x98\x83"
-    self.reader = hiredis.Reader(encoding="utf-32")
-    self.reader.feed(b"$3\r\n" + snowman + b"\r\n")
-    self.assertEquals(snowman, self.reply())
-
   def test_bulk_string_with_invalid_encoding(self):
     self.reader = hiredis.Reader(encoding="unknown")
     self.reader.feed(b"$5\r\nhello\r\n")
     self.assertRaises(LookupError, self.reply)
 
+  def test_decode_errors_defaults_to_strict(self):
+    self.reader = hiredis.Reader(encoding="utf-8")
+    self.reader.feed(b"+\x80\r\n")
+    self.assertRaises(UnicodeDecodeError, self.reader.gets)
+
+  def test_decode_error_with_ignore_errors(self):
+    self.reader = hiredis.Reader(encoding="utf-8", errors="ignore")
+    self.reader.feed(b"+\x80value\r\n")
+    self.assertEquals("value", self.reader.gets())
+
+  def test_invalid_encoding_error_handler(self):
+    self.assertRaises(LookupError, hiredis.Reader, errors='unknown')
+
+  def test_reader_with_invalid_error_handler(self):
+    self.assertRaises(LookupError, hiredis.Reader, encoding="utf-8", errors='foo')
+
   def test_should_decode_false_flag_prevents_decoding(self):
     snowman = b"\xe2\x98\x83"
     self.reader = hiredis.Reader(encoding="utf-8")