From 5c65b695eded3710b68d709904757d48efeb4ef7 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Wed, 5 Aug 2020 11:15:47 +0200 Subject: [PATCH 01/25] Transform parse functions into object_set methods Let parse_set_component, parse_template and parse_objects be methods on object_set thus allowing them to operate directly on set attributes. --- lib/extension/dlisio/ext/types.hpp | 16 ++------ lib/src/parse.cpp | 66 +++++++++++------------------- 2 files changed, 27 insertions(+), 55 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 3e90bdbdb..dc73ee6d6 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -521,6 +521,10 @@ struct object_set { void parse() noexcept (false); bool parsed = false; + + const char* parse_set_component(const char* cur) noexcept (false); + const char* parse_template(const char* cur) noexcept (false); + const char* parse_objects(const char* cur) noexcept (false); }; struct matcher { @@ -548,18 +552,6 @@ class pool { std::vector< dl::object_set > eflrs; }; -const char* parse_template( const char* begin, - const char* end, - object_template& ) noexcept (false); - - -object_set parse_objects( const char*, const char* ) noexcept (false); -const char* parse_set_component( const char*, - const char*, - ident*, - ident*, - int* ) noexcept (false); - } #endif //DLISIO_EXT_TYPES_HPP diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 2d16bac95..79ca52ea4 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -682,10 +682,9 @@ bool basic_object::operator != (const basic_object& o) const noexcept (true) { } -const char* parse_template( const char* cur, - const char* end, - object_template& out ) noexcept (false) { +const char* object_set::parse_template(const char* cur) noexcept (false) { object_template tmp; + const char* end = this->record.data.data() + this->record.data.size(); while (true) { if (cur >= end) @@ -693,7 +692,7 @@ const char* parse_template( const char* cur, const auto flags = parse_attribute_descriptor( cur ); if (flags.object) { - swap( tmp, out ); + swap( tmp, this->tmpl ); return cur; } @@ -732,7 +731,7 @@ const char* parse_template( const char* cur, if (cur == end){ debug_warning("Set contains no objects"); - swap( tmp, out ); + swap( tmp, this->tmpl ); return cur; } } @@ -852,16 +851,16 @@ noexcept (false) } } -object_vector parse_objects( const object_template& tmpl, - const dl::ident type, - const char* cur, - const char* end ) noexcept (false) { +} + +const char* object_set::parse_objects(const char* cur) noexcept (false) { + const char* end = this->record.data.data() + this->record.data.size(); object_vector objs; const auto default_object = defaulted_object( tmpl ); - while (true) { - if (std::distance( cur, end ) <= 0) + while (cur != end) { + if (std::distance( cur, end ) < 0) throw std::out_of_range( "unexpected end-of-record" ); auto object_flags = parse_object_descriptor( cur ); @@ -955,20 +954,15 @@ object_vector parse_objects( const object_template& tmpl, } objs.push_back( std::move( current ) ); - - if (cur == end) break; } - return objs; + this->objs = objs; + return cur; } -} +const char* object_set::parse_set_component(const char* cur) noexcept (false) { -const char* parse_set_component( const char* cur, - const char* end, - dl::ident* type, - dl::ident* name, - int* role) { + const char* end = this->record.data.data() + this->record.data.size(); if (std::distance( cur, end ) <= 0) throw std::out_of_range( "eflr must be non-empty" ); @@ -991,40 +985,26 @@ const char* parse_set_component( const char* cur, if (flags.type) cur = cast( cur, tmp_type ); if (flags.name) cur = cast( cur, tmp_name ); - if (type) *type = tmp_type; - if (name) *name = tmp_name; - if (role) *role = tmp_role; + this->type = tmp_type; + this->name = tmp_name; + this->role = tmp_role; return cur; } object_set::object_set(dl::record rec) noexcept (false) { - parse_set_component(rec.data.data(), - rec.data.data() + rec.data.size(), - &this->type, - &this->name, - &this->role); this->record = std::move(rec); + parse_set_component(this->record.data.data()); } void object_set::parse() noexcept (false) { if (this->parsed) return; - const char* beg = this->record.data.data(); - const char* end = beg + this->record.data.size(); - - /* Skip past the set component as it's already been read and parsed */ - auto cur = parse_set_component(beg, end, nullptr, nullptr, nullptr); - - object_template tmpl; - cur = parse_template(cur, end, tmpl); - - //TODO parse_object should return empty list when there are no objects - if (std::distance( cur, end ) > 0) { - auto objs = parse_objects(tmpl, this->type, cur, end); - this->objs = objs; - } + const char* cur = this->record.data.data(); + /* As cursor value is not stored, read data again to get the position */ + cur = parse_set_component(cur); + cur = parse_template(cur); + parse_objects(cur); - this->tmpl = tmpl; this->parsed = true; } From 2e03f624dcf5b588b4f953bd49ff9a735479cfad Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Tue, 22 Sep 2020 13:00:50 +0200 Subject: [PATCH 02/25] Add missing physical layout tests --- python/data/chap2/README.rst | 27 ++++++++++++--- python/data/chap2/lf-lrs-inconsistency.dlis | Bin 0 -> 416 bytes .../data/chap2/lf-truncated-after-lrsh.dlis | Bin 0 -> 296 bytes python/data/chap2/truncated-after-lrsh.dlis | Bin 0 -> 188 bytes python/data/chap2/truncated-in-iflr.dlis | Bin 0 -> 415 bytes .../data/chap2/truncated-in-lrsh-vr-over.dlis | Bin 0 -> 186 bytes ...rs.dlis => truncated-lr-no-lrs-in-vr.dlis} | Bin .../chap2/truncated-lr-no-lrs-vr-over.dlis | Bin 0 -> 226 bytes python/tests/test_physical_layout.py | 31 ++++++++++++++++-- 9 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 python/data/chap2/lf-lrs-inconsistency.dlis create mode 100644 python/data/chap2/lf-truncated-after-lrsh.dlis create mode 100644 python/data/chap2/truncated-after-lrsh.dlis create mode 100644 python/data/chap2/truncated-in-iflr.dlis create mode 100644 python/data/chap2/truncated-in-lrsh-vr-over.dlis rename python/data/chap2/{truncated-on-lrs.dlis => truncated-lr-no-lrs-in-vr.dlis} (100%) create mode 100644 python/data/chap2/truncated-lr-no-lrs-vr-over.dlis diff --git a/python/data/chap2/README.rst b/python/data/chap2/README.rst index b94e01884..95dd457f5 100644 --- a/python/data/chap2/README.rst +++ b/python/data/chap2/README.rst @@ -30,6 +30,13 @@ incomplete-sul.dlis Incomplete storage unit label incomplete-vr.dlis Incomplete visible record + +lf-lrs-inconsistency.dlis First implicit lrs expects successor, but + what follows is a new Logical File + +lf-truncated-after-lrsh.dlis First LF is ok, second is truncated after + File Header lrs + nondlis.txt Not a dlis file old-vr.dlis Older visible record, see 2.3.6.2 Format @@ -59,16 +66,28 @@ small.dlis Test file smaller than 4kB too-small-record.dlis Visible record is smaller than the minimum requirement (20Bytes) +truncated-after-lrsh.dlis File is truncated after LRSH which expects + successor + +truncated-in-iflr.dlis Truncated one byte before expected end of + long IFLR + truncated-in-lrsh.dlis File is truncated in the LRSH -truncated-on-full-lr.dlis File is truncated on a complete LR, but not - VR +truncated-in-lrsh-vr-over.dlis File is truncated in the LRSH, VR end aligns + with EOF. Extremely unlikely situation. truncated-in-second-lr.dlis Mismatch between visible record length and remaining bytes. Second LR truncated -truncated-on-lrs.dlis LRS expects successor, but none comes. File - is truncated +truncated-lr-no-lrs-in-vr.dlis LRS expects successor, but none comes. File + is truncated. VR is also truncated + +truncated-lr-no-lrs-vr-over.dlis LRS expects successor, but none comes. File + is truncated. LRS and VR ends align + +truncated-on-full-lr.dlis File is truncated on a complete LR, but not + VR wrong-lrhs.dlis Mismatch between logical record segment length and remaining bytes in visible diff --git a/python/data/chap2/lf-lrs-inconsistency.dlis b/python/data/chap2/lf-lrs-inconsistency.dlis new file mode 100644 index 0000000000000000000000000000000000000000..742ff87171c33d60bd7ae70d128bbc2c89ea11a0 GIT binary patch literal 416 zcmbV|!AiqG7=*Vywg)eLfawENlF~yx)@H?kMop;TaozrD7B+6!{bLb4^bz|gzJczk z;H^5RVFm`i0gyhYmq}8}^+TCqnO@B^s#R|p6+5qXgo+sFN*uoj;`1QB#y9z$oZZS* zCdWHHRMNjDOMVmdlaFQ+RYxRzku&S+&(8%wA9EF&H<%-hZy zZPADFo;7+B5dXTD&JZnTA>qk=+l zer~ElPJVi3a$=4`T4qivT$(5bLjVJlXKq1GW^!gpPNhOxeo<~>Nl9voLQ!gReo+ch MW*{qKXn^+J91%FkrL8tUg65^T!p;t5n843rRJEMQ<@2?25y85kH18FUsf0|4(*Dp>#k literal 0 HcmV?d00001 diff --git a/python/data/chap2/truncated-in-iflr.dlis b/python/data/chap2/truncated-in-iflr.dlis new file mode 100644 index 0000000000000000000000000000000000000000..62fd2e96cc7ccc6ca20a15ce2d439613089269bd GIT binary patch literal 415 zcmY#TP%sQL)H5&$a&`6(a#64_v@}vs2rkJlN=#2xC{8U=D9Yx^6*^zOD+6c_|7RsYR&@>G`R}3ShHF Ji9{j-0suc&hMfQa literal 0 HcmV?d00001 diff --git a/python/data/chap2/truncated-in-lrsh-vr-over.dlis b/python/data/chap2/truncated-in-lrsh-vr-over.dlis new file mode 100644 index 0000000000000000000000000000000000000000..957a27b7bcbf63764891d5ddcb9c67cea9918c32 GIT binary patch literal 186 zcma*gK?=e^3 Date: Wed, 18 Nov 2020 13:11:06 +0100 Subject: [PATCH 03/25] Fail truncated files in findoffsets There were several places where truncated files could fail, but as truncated files could be detected on markers level, we should inform user about that early. --- lib/src/io.cpp | 37 +++++++++++++++++++++++++++- python/tests/test_physical_layout.py | 14 +++++------ python/tests/test_tif.py | 2 +- 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/lib/src/io.cpp b/lib/src/io.cpp index a4eb787ee..0870106fc 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -210,6 +210,23 @@ noexcept (false) { } } +void check_for_truncation(dl::stream& file, std::int64_t offset) { + file.seek(offset - 1); + char tmp; + /* + * lfp returns UNEXPECTED_EOF for cfile when truncation happens + * inside of declared data + * TODO: if dlisio will support other types of io, this might have + * to be reconsidered + */ + try { + file.read(&tmp, 1); + } catch (const std::runtime_error& e) { + throw std::runtime_error("findoffsets: file truncated"); + } + return; +} + } stream::stream( lfp_protocol* f ) noexcept (false){ @@ -392,14 +409,22 @@ stream_offsets findoffsets( dl::stream& file) noexcept (false) { stream_offsets ofs; std::int64_t offset = 0; + bool has_successor = false; char buffer[ DLIS_LRSH_SIZE ]; int len = 0; while (true) { file.seek(offset); file.read(buffer, DLIS_LRSH_SIZE); - if (file.eof()) + if (file.eof()) { + check_for_truncation(file, offset); + if (has_successor) { + auto msg = "File is over, but last logical record segment " + "expects successor"; + throw std::runtime_error(msg); + } break; + } int type; std::uint8_t attrs; @@ -420,6 +445,14 @@ stream_offsets findoffsets( dl::stream& file) noexcept (false) { * this LR and all successive LR's until we encounter another * FILE-HEADER. */ + check_for_truncation(file, offset); + + if (has_successor) { + auto msg = "New logical file appears, but previous logical " + "record segment expects successor"; + throw std::runtime_error(msg); + } + file.seek( offset ); break; } @@ -432,6 +465,8 @@ stream_offsets findoffsets( dl::stream& file) noexcept (false) { */ else ofs.implicits.push_back( offset ); } + + has_successor = attrs & DLIS_SEGATTR_SUCCSEG; offset += len; } return ofs; diff --git a/python/tests/test_physical_layout.py b/python/tests/test_physical_layout.py index cc17f0167..6c2ecb15f 100644 --- a/python/tests/test_physical_layout.py +++ b/python/tests/test_physical_layout.py @@ -42,11 +42,11 @@ def test_lrs_atributes_inconsistency(): _ = dlisio.load('data/chap2/attrs-inconsistency-type-pred.dlis') assert "inconsistency" in str(excinfo.value) -@pytest.mark.xfail(reason="not implemented", strict=True) def test_logical_file_lrs_inconsistency(): with pytest.raises(RuntimeError) as excinfo: - _ = dlisio.load('data/chap2/lf_lrs_inconsistency.dlis') - assert "inconsistency" in str(excinfo.value) + _ = dlisio.load('data/chap2/lf-lrs-inconsistency.dlis') + msg = "file appears, but previous logical record segment expects successor" + assert msg in str(excinfo.value) def test_padbytes_as_large_as_record(): path = 'data/chap2/padbytes-large-as-record.dlis' @@ -97,7 +97,6 @@ def test_broken_vr(): _ = dlisio.load('data/chap2/incomplete-vr.dlis') assert "file may be corrupted" in str(excinfo.value) -@pytest.mark.xfail(reason="should raise", strict=True) def test_truncated_in_iflr(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-in-iflr.dlis') @@ -106,7 +105,7 @@ def test_truncated_in_iflr(): def test_truncated_in_second_lr(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-in-second-lr.dlis') - assert "unexpected EOF when reading record" in str(excinfo.value) + assert "findoffsets: file truncated" in str(excinfo.value) def test_truncated_in_lrsh(): with pytest.raises(RuntimeError) as excinfo: @@ -121,7 +120,7 @@ def test_truncated_in_lrsh_vr_over(): def test_truncated_after_lrsh(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-after-lrsh.dlis') - assert "unexpected EOF when reading record" in str(excinfo.value) + assert "findoffsets: file truncated" in str(excinfo.value) def test_truncated_lr_missing_lrs_in_vr(): with pytest.raises(RuntimeError) as excinfo: @@ -131,7 +130,8 @@ def test_truncated_lr_missing_lrs_in_vr(): def test_truncated_lr_missing_lrs_vr_over(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-lr-no-lrs-vr-over.dlis') - assert "extract: unable to read LRSH" in str(excinfo.value) + msg = "last logical record segment expects successor" + assert msg in str(excinfo.value) def test_truncated_on_full_lr(): with pytest.raises(RuntimeError) as excinfo: diff --git a/python/tests/test_tif.py b/python/tests/test_tif.py index 384e81fda..08006712b 100644 --- a/python/tests/test_tif.py +++ b/python/tests/test_tif.py @@ -96,7 +96,7 @@ def test_layout_truncated_in_data(): fpath = 'data/tif/layout/truncated-in-data.dlis' with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load(fpath) - assert "unexpected EOF" in str(excinfo.value) + assert "findoffsets: file truncated" in str(excinfo.value) def test_layout_truncated_in_header(): fpath = 'data/tif/layout/truncated-in-header.dlis' From 2b129df50134594f390010b56996401f0cbe7e4c Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Thu, 15 Oct 2020 18:36:54 +0200 Subject: [PATCH 04/25] Introduce dlis_error and error_severity dlis_error and error_severity are two beings that we would need later when creating our error-mechanism. dlis_error represents error one way or another related to protocol processing. error_severity is a field in dlis error which shows how critical for further and correct processing of data is this error according to developers. --- lib/extension/dlisio/ext/types.hpp | 25 +++++++++++++++++++++++++ python/dlisio/ext/core.cpp | 14 ++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index dc73ee6d6..3f3a68840 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -390,6 +390,31 @@ using value_vector = mpark::variant< std::vector< units > >; +/* + * Assigned error severity. + */ +enum class error_severity { + INFO = 1, // everything seems fine, but situation itself is not typical + MINOR = 2, // contradicts specification, but recovery is most likely ok + MAJOR = 3, // contradicts specification, not sure if recovery is ok + CRITICAL = 4, // broken beyond repair, could not recover +}; + +/* + * Error which is caused by violations with regards to rp66 protocol. + * + * It can be classified by us in different severity levels. In some situations + * we might know that violation is common and have a good idea how to recover. + * In other situations violation might be too severe for us to do anything about + * it. + */ +struct dlis_error { + error_severity severity; + std::string problem; + std::string specification; + std::string action; +}; + struct record { bool isexplicit() const noexcept (true); bool isencrypted() const noexcept (true); diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 3f1cfc00c..d9f1b99c0 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -1000,6 +1000,20 @@ PYBIND11_MODULE(core, m) { return py::make_tuple( ofs.explicits, ofs.implicits ); }); + py::enum_< dl::error_severity >( m, "error_severity" ) + .value( "info", dl::error_severity::INFO ) + .value( "minor", dl::error_severity::MINOR ) + .value( "major", dl::error_severity::MAJOR ) + .value( "critical", dl::error_severity::CRITICAL ) + ; + + py::class_< dl::dlis_error >( m, "dlis_error" ) + .def_readonly( "severity", &dl::dlis_error::severity ) + .def_readonly( "problem", &dl::dlis_error::problem ) + .def_readonly( "specification", &dl::dlis_error::specification ) + .def_readonly( "action", &dl::dlis_error::action ) + ; + m.def("set_encodings", set_encodings); m.def("get_encodings", get_encodings); From 6a556bd708d2bde42101bc98fb147d6654eebb75 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Thu, 5 Nov 2020 13:01:21 +0100 Subject: [PATCH 05/25] Add error_handler As we plan to finally give users all the warnings we see during operations in C++, we need to create an error handler class familiar both to C++ and to Python. We'll do it similar way to the PyMatcher. While logger conceptually could be a global property (like encodings), decision was made to avoid global settings and make logger a dlis attribute. It's likely to stay the same for various files, but now user is in control. --- lib/extension/dlisio/ext/types.hpp | 9 ++ python/dlisio/__init__.py | 1 + python/dlisio/errors/__init__.py | 1 + python/dlisio/errors/errorhandler.py | 206 +++++++++++++++++++++++++++ python/dlisio/ext/core.cpp | 36 ++++- python/dlisio/file.py | 3 +- python/dlisio/load.py | 13 +- python/docs/dlisio.rst | 5 + python/setup.py | 2 +- 9 files changed, 269 insertions(+), 7 deletions(-) create mode 100644 python/dlisio/errors/__init__.py create mode 100644 python/dlisio/errors/errorhandler.py diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 3f3a68840..92bae4bfd 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -415,6 +415,15 @@ struct dlis_error { std::string action; }; +struct error_handler { + virtual void log(const error_severity &level, const std::string &context, + const std::string &problem,const std::string &specification, + const std::string &action) + const noexcept(false) = 0; + + virtual ~error_handler() = default; +}; + struct record { bool isexplicit() const noexcept (true); bool isencrypted() const noexcept (true); diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index 340c68247..a0ecd9e2b 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -1,5 +1,6 @@ from . import core from . import plumbing +from . import errors from .settings import get_encodings, set_encodings from .file import physicalfile, logicalfile from .load import open, load diff --git a/python/dlisio/errors/__init__.py b/python/dlisio/errors/__init__.py new file mode 100644 index 000000000..1ee5b32c0 --- /dev/null +++ b/python/dlisio/errors/__init__.py @@ -0,0 +1 @@ +from .errorhandler import * diff --git a/python/dlisio/errors/errorhandler.py b/python/dlisio/errors/errorhandler.py new file mode 100644 index 000000000..682ab0b0c --- /dev/null +++ b/python/dlisio/errors/errorhandler.py @@ -0,0 +1,206 @@ +import logging +from enum import Enum + +from dlisio import core + +class Actions(Enum): + """ + Actions available for various specification violations + """ + LOG_DEBUG = lambda msg: logging.debug(msg) + """logging.debug""" + LOG_INFO = lambda msg: logging.info(msg) + """logging.info""" + LOG_WARNING = lambda msg: logging.warning(msg) + """logging.warning""" + LOG_ERROR = lambda msg: logging.error(msg) + """logging.error""" + RAISE = lambda msg: ErrorHandler.raise_msg(msg) + """raise RuntimeError""" + SWALLOW = lambda msg: ErrorHandler.swallow(msg) + """pass""" + + def __init__(self, handle): + self.handle = handle + +class ErrorHandler(core.error_handler): + """ Defines rules about error handling + + Many .dlis files happen to be not compliant with specification or simply + broken. This class gives user some control over handling of such files. + + When dlisio encounters a specification violation, it categories the issue + based on the severity of the violation. Some issues are easy to ignore + while other might force dlisio to give up on its current task. + ErrorHandler supplies an interface for changing how dlisio reacts to + different violation in the file. + + Different categories are *info*, *minor*, *major* and *critical*: + + ======== =================================================================== + Severity Description + ======== =================================================================== + critical Any issue that forces dlisio stop its current objective prematurely + is categorised as critical. + + By default a critical error raises a RuntimeError. + + An example would be file indexing, which happens at load. Suppose + the indexing fails midways through the file. There is no way for + dlisio to reliably keep indexing the file. + However, it is likely that the file is readable up until the point + of failure. Changing the behaviour of critical from raising an + Exception to logging would in this case mean that a partially + indexed file is returned by load. + major Result of a direct specification violation in the file. dlisio + makes an assumption about what broken information [1] should have + been and continues parsing the file on this assumption. + If no other major or critical issues are reported, it's likely that + assumption was correct and that dlisio parsed the file correctly. + However, no guarantees can be made. + + By default a warning is logged. + + [1] Note that "information" in this case refers to the data in the + file that tells dlisio how the file should be parsed, not to the + actual parsed data. + + minor Like Major issues, this is also a result of a direct specification + violation. dlisio makes similar assumptions to keep parsing the + file. Minor issues are generally less severe and, in contrast to + major issues, are more likely to be handled correctly. + However, still no guarantees can be made about the parsed data. + + By default an info message is logged. + info Issue doesn't contradict specification, but situation is peculiar. + + By default a debug message is logged. + + ======== =================================================================== + + ErrorHandler only applies to issues related to parsing information from the + file. These are issues that otherwise would force dlisio to fail, such as + direct violations of the RP66v1 specification. + It does not apply to inconsistencies and issues in the parsed data. + This means that cases where dlisio enforces behaviour of the parsed data, + such as object-to-object references, are out of scope for the ErrorHandler. + + Please also note that ErrorHandler doesn't redefine issues categories, it + only changes default behavior. + + Attributes + ---------- + info: + Action for merely information message + + minor: + Action for minor specification violation + + major: + Action for major specification violation + + critical: + Action for critical specification violation + + + Warnings + -------- + Escaping errors is a good solution when user needs to read as much data as + possible, for example, to have a general overview over the file. However + user must be careful when using this mode during close inspection. + If user decides to accept errors, they must be aware that some returned data + will be spoiled. Most likely it will be data which is stored in the file + near the failure. + + Warnings + -------- + Be careful not to ignore too much information when investigating files. + If you want to debug a broken part of the file, you should look at all + issues to get a full picture of the situation. + + Examples + -------- + Define your own rules: + + >>> from dlisio.errors import ErrorHandler, Actions + >>> def myhandler(msg): + ... raise RuntimeError("Custom handler: " + msg) + >>> errorhandler = ErrorHandler( + ... info = Actions.SWALLOW, + ... minor = Actions.LOG_WARNING, + ... major = Actions.RAISE, + ... critical = myhandler) + + Parse a file: + + >>> files = dlisio.load(path) + RuntimeError: "...." + >>> handler = ErrorHandler(critical=Actions.LOG_ERROR) + >>> files = dlisio.load(path, errorhandler=handler) + [ERROR] "...." + >>> for f in files: + ... pass + + """ + @staticmethod + def raise_msg(msg): + raise RuntimeError(msg) + + @staticmethod + def swallow(msg): + pass + + def __init__(self, info = Actions.LOG_DEBUG, + minor = Actions.LOG_INFO, + major = Actions.LOG_WARNING, + critical = Actions.RAISE): + + core.error_handler.__init__(self) + + self.info = info + self.minor = minor + self.major = major + self.critical = critical + + def log(self, severity, context, problem, spec, action): + msg = ErrorHandler.format_error( + severity, context, problem, spec, action) + + """ Overrides dl::error_handler::log """ + if severity == core.error_severity.info: handler = self.info + elif severity == core.error_severity.minor: handler = self.minor + elif severity == core.error_severity.major: handler = self.major + elif severity == core.error_severity.critical: handler = self.critical + else: + err='Unknown error severity. Original msg was: {}' + raise RuntimeError(err.format(msg)) + + handler(msg) + + @staticmethod + def format_severity(severity): + if severity == core.error_severity.info: return "info" + elif severity == core.error_severity.minor: return "minor" + elif severity == core.error_severity.major: return "major" + elif severity == core.error_severity.critical: return "critical" + else: return severity + + # TODO: check if that helps for named fileheader issue + # TODO: nested format + @staticmethod + def format_error(severity, context, problem, spec, action): + format = '\n{:<{align}} {}' + severity = ErrorHandler.format_severity(severity) + + problem = format.format('Problem:', problem, align=13) + context = format.format('Where:', context, align=13) + severity = format.format('Severity:', severity, align=13) + + if spec: + spec = format.format('RP66V1 ref:', spec, align=13) + + if action: + action = format.format('Action taken:', action, align=13) + + msg = "{}{}{}{}{}" + return msg.format(problem, context, severity, spec, action) diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index d9f1b99c0..a7e6639ea 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -717,6 +717,30 @@ class Pymatcher : public dl::matcher { } }; +/** trampoline helper class for dl::error_handler bindings */ +class PyErrorHandler : public dl::error_handler { +public: + /* Inherit the constructor */ + using dl::error_handler::error_handler; + + /* Trampoline (need one for each virtual function) */ + void log(const dl::error_severity& level, const std::string& context, + const std::string& problem, const std::string& specification, + const std::string& action) + const noexcept(false) override { + PYBIND11_OVERLOAD_PURE( + void, /* Return type */ + dl::error_handler, /* Parent class */ + log, /* Name of function in C++ */ + level, /* Argument(s) */ + context, + problem, + specification, + action + ); + } +}; + } PYBIND11_MAKE_OPAQUE( std::vector< dl::object_set > ) @@ -1014,10 +1038,16 @@ PYBIND11_MODULE(core, m) { .def_readonly( "action", &dl::dlis_error::action ) ; - m.def("set_encodings", set_encodings); - m.def("get_encodings", get_encodings); - py::class_< dl::matcher, Pymatcher >( m, "matcher") .def(py::init<>()) ; + + py::class_< dl::error_handler, PyErrorHandler >( m, "error_handler") + .def(py::init<>()) + ; + + /* settings */ + m.def("set_encodings", set_encodings); + m.def("get_encodings", get_encodings); + } diff --git a/python/dlisio/file.py b/python/dlisio/file.py index 313344319..de0300d41 100644 --- a/python/dlisio/file.py +++ b/python/dlisio/file.py @@ -95,11 +95,12 @@ class logicalfile(object): then the users responsibility of ensuring correctness for the custom class. """ - def __init__(self, stream, object_pool, fdata_index, sul=None): + def __init__(self, stream, object_pool, fdata_index, sul, error_handler): self.file = stream self.object_pool = object_pool self.sul = sul self.fdata_index = fdata_index + self.error_handler = error_handler if 'UPDATE' in self.object_pool.types: msg = ('{} contains UPDATE-object(s) which changes other ' diff --git a/python/dlisio/load.py b/python/dlisio/load.py index bcab6713d..22e6fcba6 100644 --- a/python/dlisio/load.py +++ b/python/dlisio/load.py @@ -1,5 +1,6 @@ from . import core from .file import physicalfile, logicalfile +from .errors import ErrorHandler def open(path): """ Open a file @@ -21,7 +22,7 @@ def open(path): """ return core.open(str(path)) -def load(path): +def load(path, error_handler = None): """ Loads a file and returns one filehandle pr logical file. The dlis standard have a concept of logical files. A logical file is a @@ -48,6 +49,11 @@ def load(path): path : str_like + error_handler : dlisio.errors.ErrorHandler, optional + Error handling rules. Default rules will apply if none supplied. + Handler will be added to all the logical files, so users may modify + the behavior at any time. + Examples -------- @@ -75,6 +81,9 @@ def load(path): dlis : dlisio.physicalfile(dlisio.logicalfile) """ + if not error_handler: + error_handler = ErrorHandler() + sulsize = 80 tifsize = 12 lfs = [] @@ -125,7 +134,7 @@ def rewind(offset, tif): pool = core.pool(sets) fdata = core.findfdata(stream, implicits) - lf = logicalfile(stream, pool, fdata, sul) + lf = logicalfile(stream, pool, fdata, sul, error_handler) lfs.append(lf) try: diff --git a/python/docs/dlisio.rst b/python/docs/dlisio.rst index 4deeaa397..8ed91725f 100644 --- a/python/docs/dlisio.rst +++ b/python/docs/dlisio.rst @@ -3,6 +3,11 @@ Strings and encodings .. autofunction:: dlisio.set_encodings .. autofunction:: dlisio.get_encodings +Error handling +============== +.. autoclass:: dlisio.errors.ErrorHandler() +.. autoclass:: dlisio.errors.Actions() + Open and Load ============= .. autofunction:: dlisio.load diff --git a/python/setup.py b/python/setup.py index 469ce75d8..a335a2201 100755 --- a/python/setup.py +++ b/python/setup.py @@ -52,7 +52,7 @@ def getversion(): description = 'DLIS v1', long_description = 'DLIS v1', url = 'https://github.com/equinor/dlisio', - packages = ['dlisio', 'dlisio.plumbing'], + packages = ['dlisio', 'dlisio.plumbing', 'dlisio.errors'], license = 'LGPL-3.0', platforms = 'any', install_requires = ['numpy'], From 4da7baa8a2dee548e741930848aeb8baa6e5700b Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Thu, 5 Nov 2020 16:35:30 +0100 Subject: [PATCH 06/25] Propagate attribute warnings to Python MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introducing system that will send debug/info/warning information that appears during attributes parsing to the user. Co-authored-by: Erlend Hårstad --- lib/extension/dlisio/ext/types.hpp | 19 +++++++ lib/src/parse.cpp | 73 +++++++++++++++++---------- python/dlisio/ext/core.cpp | 1 + python/dlisio/plumbing/basicobject.py | 12 ++++- python/tests/test_logical_record.py | 24 ++++++--- 5 files changed, 94 insertions(+), 35 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 92bae4bfd..72a0ba322 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -436,6 +436,23 @@ struct record { /* * The structure of an attribute as described in 3.2.2.1 + * + * Error handling: + * + * Due to the intricate structure of a dlis-file, dlisio typically over-parses + * when a certain piece of information is queried. This would typically make + * any warnings or errors issued from the parsing routines pre-mature and might + * result in the query failing due to an error in some (from a + * user-perspective) unrelated data. + * + * To combat this, the result of parsing routines (and the state of the + * parsed object) is communicated through error codes set on the object. + * + * It is the consumers responsibility to check the state of the + * object before using its content. + * + * object_attribute.log contains a list of dl::dlis_error, which provide + * human-readable explanation of the problem */ struct object_attribute { dl::ident label = {}; @@ -446,6 +463,8 @@ struct object_attribute { dl::value_vector value = {}; bool invariant = false; + std::vector< dl::dlis_error > log; + bool operator == (const object_attribute& ) const noexcept (true); }; diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 79ca52ea4..f237f21c1 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -483,12 +483,10 @@ std::vector< T >& reset( dl::value_vector& value ) noexcept (false) { return value.emplace< std::vector< T > >(); } -const char* elements( const char* xs, - dl::uvari count, - dl::representation_code reprc, - dl::value_vector& vec ) { - - const auto n = dl::decay( count ); +const char* elements( const char* xs, dl::object_attribute& attr ) { + const auto reprc = attr.reprc; + dl::value_vector& vec = attr.value; + const auto n = dl::decay( attr.count ); if (n == 0) { vec = mpark::monostate{}; @@ -722,9 +720,7 @@ const char* object_set::parse_template(const char* cur) noexcept (false) { if (flags.count) cur = cast( cur, attr.count ); if (flags.reprc) cur = cast( cur, attr.reprc ); if (flags.units) cur = cast( cur, attr.units ); - if (flags.value) cur = elements( cur, attr.count, - attr.reprc, - attr.value ); + if (flags.value) cur = elements( cur, attr ); attr.invariant = flags.invariant; tmp.push_back( std::move( attr ) ); @@ -775,7 +771,8 @@ struct shrink { void patch_missing_value( dl::value_vector& value, std::size_t count, - dl::representation_code reprc ) + dl::representation_code reprc, + std::vector< dl::dlis_error >& log) noexcept (false) { /* @@ -789,7 +786,21 @@ noexcept (false) /* smaller, shrink and all is fine */ if (size > count) { + + const auto msg = + "template value is not overridden by object attribute, but " + "count is. count ({}) < template count ({})"; + mpark::visit( shrink( count ), value ); + dlis_error err { + dl::error_severity::MAJOR, + fmt::format(msg, count, size), + "3.2.2.1 Component Descriptor: The number of Elements that " + "make up the Value is specified by the Count " + "Characteristic.", + "shrank template value to new attribute count" + }; + log.push_back(err); return; } @@ -892,28 +903,31 @@ const char* object_set::parse_objects(const char* cur) noexcept (false) { } if (flags.invariant) { - /* - * 3.2.2.2 Component Usage - * Invariant Attribute Components, which may only appear in - * the Template [...] - * - * Assume this is a mistake, assume it was a regular - * non-invariant attribute - */ - user_warning("ATTRIB:invariant in attribute, " - "but should only be in template"); + dlis_error err { + dl::error_severity::MAJOR, + "Invariant attribute in object attributes", + "3.2.2.2 Component Usage: Invariant Attribute Components, " + "which may only appear in the Template [...]", + "ignored invariant bit, assumed that attribute followed" + }; + attr.log.push_back(err); } if (flags.label) { - user_warning( "ATTRIB:label set, but must be null"); + dlis_error err { + dl::error_severity::MAJOR, + "Label bit set in object attribute", + "3.2.2.2 Component Usage: Attribute Components that follow " + "Object Components must not have Attribute Labels", + "ignored label bit, assumed that label never followed" + }; + attr.log.push_back(err); } if (flags.count) cur = cast( cur, attr.count ); if (flags.reprc) cur = cast( cur, attr.reprc ); if (flags.units) cur = cast( cur, attr.units ); - if (flags.value) cur = elements( cur, attr.count, - attr.reprc, - attr.value ); + if (flags.value) cur = elements( cur, attr ); const auto count = dl::decay( attr.count ); @@ -943,11 +957,18 @@ const char* object_set::parse_objects(const char* cur) noexcept (false) { const auto msg = "count ({}) isn't 0 and representation " "code ({}) changed, but value is not explicitly set"; const auto code = static_cast< int >(attr.reprc); - user_warning(fmt::format(msg, count, code)); + dlis_error err { + dl::error_severity::MAJOR, + fmt::format(msg, count, code), + "", + "value defaulted based on representation code from " + "attribute" + }; + attr.log.push_back(err); attr.value = mpark::monostate{}; } - patch_missing_value( attr.value, count, attr.reprc ); + patch_missing_value( attr.value, count, attr.reprc, attr.log); } current.set(attr); diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index a7e6639ea..469d22abc 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -876,6 +876,7 @@ PYBIND11_MODULE(core, m) { py::class_< dl::object_attribute >( m, "object_attribute" ) .def_readonly("value", &dl::object_attribute::value) .def_readonly("units", &dl::object_attribute::units) + .def_readonly("log", &dl::object_attribute::log) ; py::class_< dl::basic_object >( m, "basic_object" ) diff --git a/python/dlisio/plumbing/basicobject.py b/python/dlisio/plumbing/basicobject.py index 5adb2a93a..d45eddc58 100644 --- a/python/dlisio/plumbing/basicobject.py +++ b/python/dlisio/plumbing/basicobject.py @@ -197,10 +197,20 @@ def __getitem__(self, key): parse_as = vector try: - rp66value = self.attic[key].value + attribute = self.attic[key] except KeyError: return defaultvalue(parse_as) + rp66value = attribute.value + + # already report errors. presence of key in the attic is enough + if len(attribute.log) > 0: + context = "{}-A.{}".format(self.fingerprint, key) + for error in attribute.log: + self.logicalfile.error_handler.log( + error.severity, context, error.problem, + error.specification, error.action) + if rp66value is None: return defaultvalue(parse_as) if rp66value == [] : return defaultvalue(parse_as) diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index a9355ca3d..9740f09a3 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -63,8 +63,7 @@ def test_invariant_attribute(tmpdir, merge_files_oneLR): assert attr.value == [False, False, True] -@pytest.mark.future_warning_invariant_attribute -def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR): +def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'invariant-attribute-in-object.dlis') content = [ 'data/chap3/start.dlis.part', @@ -79,6 +78,9 @@ def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR): attr = obj.attic['DEFAULT_ATTRIBUTE'] assert attr.value == [8.0] + _ = obj['DEFAULT_ATTRIBUTE'] + assert_log("Invariant attribute") + def test_absent_attribute(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'attribute_absent.dlis') @@ -301,8 +303,7 @@ def test_repcode_invalid_in_objects(tmpdir, merge_files_oneLR): f.load() assert "unknown representation code" in str(excinfo.value) -@pytest.mark.future_warning_repcode_different_no_value -def test_repcode_different_no_value(tmpdir, merge_files_oneLR): +def test_repcode_different_no_value(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'different-repcode-no-value.dlis') content = [ 'data/chap3/start.dlis.part', @@ -314,7 +315,10 @@ def test_repcode_different_no_value(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *_): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) - assert obj.attic['DEFAULT_ATTRIBUTE'].value == [0j, 0j] + + attr = obj['DEFAULT_ATTRIBUTE'] + assert attr == [0j, 0j] + assert_log("value is not explicitly set") @pytest.mark.future_test_attributes def test_count0_novalue(tmpdir, merge_files_oneLR): @@ -371,8 +375,7 @@ def test_count0_different_repcode(tmpdir, merge_files_oneLR): assert attr.value == None -@pytest.mark.future_warning_label_bit_set_in_object_attr -def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR): +def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'label_bit_set_in_attribute.dlis') content = [ 'data/chap3/start.dlis.part', @@ -386,6 +389,8 @@ def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'].value + _ = obj['DEFAULT_ATTRIBUTE'] + assert_log("Label bit") @pytest.mark.future_warning_label_bit_not_set_in_template @pytest.mark.not_implemented_datetime_timezone @@ -437,7 +442,7 @@ def test_object_name_bit_not_set_in_object(tmpdir, merge_files_oneLR): @pytest.mark.future_test_attributes -def test_novalue_less_count(tmpdir, merge_files_oneLR): +def test_novalue_less_count(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'novalue-less-count.dlis') content = [ 'data/chap3/start.dlis.part', @@ -455,6 +460,9 @@ def test_novalue_less_count(tmpdir, merge_files_oneLR): assert attr.units == 'default attr units' assert attr.value == [-0.75] + _ = obj['DEFAULT_ATTRIBUTE'] + assert_log("< template count") + @pytest.mark.not_implemented def test_novalue_more_count(tmpdir, merge_files_oneLR): From f9927ce6c64bcd2466b8f3787892b5785c3a63e2 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Thu, 5 Nov 2020 19:09:17 +0100 Subject: [PATCH 07/25] Sometimes allow invalid representation codes We have several files where - reprcode is invalid in template - attribute descriptor in object attribute is "20" We already allowed invalid representation code in templates (ee1680b22d44fe4f898196d9f6c836e8ec45479b) in hope that they will get overwritten in future. Sometimes they are and sometimes they are not. But we can deal with situation described above fairly well. Attribute is clearly spoiled, but that doesn't prevent us from parsing further. Hence we can just log this error and continue. --- lib/src/parse.cpp | 63 ++++++-- .../reprcode-invalid-no-value.dlis.part | 1 + ....part => reprcode-invalid-value.dlis.part} | 0 python/data/chap3/object/object2.dlis.part | 1 + python/data/chap3/template/attribute2.part | Bin 0 -> 56 bytes python/tests/test_logical_record.py | 136 +++++++++++++++++- 6 files changed, 183 insertions(+), 18 deletions(-) create mode 100644 python/data/chap3/objattr/reprcode-invalid-no-value.dlis.part rename python/data/chap3/objattr/{reprcode-invalid.dlis.part => reprcode-invalid-value.dlis.part} (100%) create mode 100644 python/data/chap3/object/object2.dlis.part create mode 100644 python/data/chap3/template/attribute2.part diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index f237f21c1..1f44709a9 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -454,14 +454,37 @@ const char* cast( const char* xs, xs = cast( xs, x ); if (x < DLIS_FSHORT || x > DLIS_UNITS) { - debug_warning("Read incorrect representation code"); reprc = dl::representation_code::undef; - }else{ + } else { reprc = static_cast< dl::representation_code >( x ); } return xs; } +const char* repcode(const char* xs, dl::object_attribute& attr ) +noexcept (false) { + dl::representation_code& reprc = attr.reprc; + + auto cur = cast(xs, reprc); + if (reprc == dl::representation_code::undef) { + // retrieve value again for reporting, as it is lost + dl::ushort x{ 0 }; + cast( xs, x ); + + const auto msg = "Invalid representation code {}"; + const auto code = static_cast< int >(x); + dl::dlis_error err { + dl::error_severity::MINOR, + fmt::format(msg, code), + "Appendix B: Representation Codes", + "Continue. Postpone dealing with this until later" + }; + attr.log.push_back(err); + } + + return cur; +} + template < typename T > const char* extract( std::vector< T >& vec, std::int32_t count, @@ -718,7 +741,7 @@ const char* object_set::parse_template(const char* cur) noexcept (false) { cur = cast( cur, attr.label ); if (flags.count) cur = cast( cur, attr.count ); - if (flags.reprc) cur = cast( cur, attr.reprc ); + if (flags.reprc) cur = repcode( cur, attr ); if (flags.units) cur = cast( cur, attr.units ); if (flags.value) cur = elements( cur, attr ); attr.invariant = flags.invariant; @@ -769,12 +792,13 @@ struct shrink { } }; -void patch_missing_value( dl::value_vector& value, - std::size_t count, - dl::representation_code reprc, - std::vector< dl::dlis_error >& log) +void patch_missing_value( dl::object_attribute& attr ) noexcept (false) { + const dl::representation_code reprc = attr.reprc; + const std::size_t count = dl::decay( attr.count ); + dl::value_vector& value = attr.value; + /* * value is *NOT* monostate, i.e. there is a default value. if count != * values.size(), resize it. @@ -800,7 +824,7 @@ noexcept (false) "Characteristic.", "shrank template value to new attribute count" }; - log.push_back(err); + attr.log.push_back(err); return; } @@ -854,10 +878,23 @@ noexcept (false) case rpc::status: reset< dl::status >(value).resize(count); return; case rpc::units: reset< dl::units >(value).resize(count); return; default: { - const auto msg = "unable to patch attribute with no value: " - "unknown representation code {}"; + // repcode is incorrect, but value is missing + // hence we can log an error but continue processing + const auto msg = + "invalid representation code {}"; + /* TODO: there is a problem with reporting. If representation code + * is undefined, we define the value to be 66, not the actual value + * that is present in the file + */ const auto code = static_cast< int >(reprc); - throw std::runtime_error(fmt::format(msg, code)); + dl::dlis_error err { + dl::error_severity::CRITICAL, + fmt::format(msg, code), + "Appendix B: Representation Codes", + "attribute value is left as template default. Continue" + }; + attr.log.push_back(err); + } } } @@ -925,7 +962,7 @@ const char* object_set::parse_objects(const char* cur) noexcept (false) { } if (flags.count) cur = cast( cur, attr.count ); - if (flags.reprc) cur = cast( cur, attr.reprc ); + if (flags.reprc) cur = repcode( cur, attr ); if (flags.units) cur = cast( cur, attr.units ); if (flags.value) cur = elements( cur, attr ); @@ -968,7 +1005,7 @@ const char* object_set::parse_objects(const char* cur) noexcept (false) { attr.value = mpark::monostate{}; } - patch_missing_value( attr.value, count, attr.reprc, attr.log); + patch_missing_value( attr ); } current.set(attr); diff --git a/python/data/chap3/objattr/reprcode-invalid-no-value.dlis.part b/python/data/chap3/objattr/reprcode-invalid-no-value.dlis.part new file mode 100644 index 000000000..6613a6877 --- /dev/null +++ b/python/data/chap3/objattr/reprcode-invalid-no-value.dlis.part @@ -0,0 +1 @@ +$ \ No newline at end of file diff --git a/python/data/chap3/objattr/reprcode-invalid.dlis.part b/python/data/chap3/objattr/reprcode-invalid-value.dlis.part similarity index 100% rename from python/data/chap3/objattr/reprcode-invalid.dlis.part rename to python/data/chap3/objattr/reprcode-invalid-value.dlis.part diff --git a/python/data/chap3/object/object2.dlis.part b/python/data/chap3/object/object2.dlis.part new file mode 100644 index 000000000..90011b0e3 --- /dev/null +++ b/python/data/chap3/object/object2.dlis.part @@ -0,0 +1 @@ +pOBJECT2 \ No newline at end of file diff --git a/python/data/chap3/template/attribute2.part b/python/data/chap3/template/attribute2.part new file mode 100644 index 0000000000000000000000000000000000000000..69ab2f71be628c729f5b8e817f7b6db7311d8577 GIT binary patch literal 56 zcmcB)40d()_j56dcMJ&$@^lIfab;o`N=Z#iEX^rVNGvHSQYg*KEGgdqf&mO1R3J0} D%Tf-D literal 0 HcmV?d00001 diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index 9740f09a3..e3efe8fb0 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -257,7 +257,15 @@ def test_repcode_invalid_datetime(tmpdir, merge_files_oneLR): assert "month must be" in str(excinfo.value) def test_repcode_invalid_in_template_value(tmpdir, merge_files_oneLR): - path = os.path.join(str(tmpdir), 'invalid-repcode.dlis') + path = os.path.join(str(tmpdir), 'invalid-repcode-template-value.dlis') + + # template for attribute 'INVALID': + # repcode = 0 # invalid + # value = [bytes] # value is declared to be present + + # attribute 'INVALID' in obj 'OBJECT': + # repcode = USHORT # valid + # value = [1, 2, 3, 4] # valid value content = [ 'data/chap3/start.dlis.part', 'data/chap3/template/invalid-repcode-value.dlis.part', @@ -272,8 +280,17 @@ def test_repcode_invalid_in_template_value(tmpdir, merge_files_oneLR): assert "unknown representation code" in str(excinfo.value) -def test_repcode_invalid_in_template_no_value(tmpdir, merge_files_oneLR): - path = os.path.join(str(tmpdir), 'invalid-repcode-template.dlis') +def test_repcode_invalid_in_template_no_value_fixed(tmpdir, merge_files_oneLR, + assert_info): + path = os.path.join(str(tmpdir), 'invalid-repcode-template-fixed.dlis') + + # template for attribute 'INVALID': + # repcode = 0 # invalid + # value = None # no value + + # attribute 'INVALID' in obj 'OBJECT': + # repcode = USHORT # valid + # value = [1, 2, 3, 4] # valid value content = [ 'data/chap3/start.dlis.part', 'data/chap3/template/invalid-repcode-no-value.dlis.part', @@ -287,14 +304,90 @@ def test_repcode_invalid_in_template_no_value(tmpdir, merge_files_oneLR): attr = obj.attic['INVALID'] assert attr.value == [1, 2, 3, 4] + _ = obj['INVALID'] + assert_info("Invalid representation code 0") + +def test_repcode_invalid_in_template_no_value_not_fixed(tmpdir, + merge_files_oneLR, + assert_info): + path = os.path.join(str(tmpdir), 'invalid-repcode-template-bad.dlis') + + + # template for attribute 'INVALID': + # repcode = 0 # invalid + # value = None # no value + + # attribute 'INVALID' in obj 'OBJECT' is implicitly taken from template + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/object/object.dlis.part', + ] + merge_files_oneLR(path, content) + with dlisio.load(path) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + attr = obj.attic['INVALID'] + # note that behavior is different from the one below + # here we never process the attribute, hence error is not triggered + assert attr.value == None + + _ = obj['INVALID'] + assert_info("Invalid representation code 0") + +def test_repcode_invalid_in_template_no_value_empty(tmpdir, merge_files_oneLR, + assert_log, assert_info): + path = os.path.join(str(tmpdir), 'invalid-repcode-template-bad-empty.dlis') + + # template for attribute 'INVALID': + # repcode = 0 # invalid + # value = None # no value + + # attribute 'INVALID' in obj 'OBJECT' is explicitly taken from template + + # attribute 'INVALID' in obj 'OBJECT2': + # repcode = USHORT # valid + # value = [1, 2, 3, 4] # valid value + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/empty.dlis.part', + 'data/chap3/object/object2.dlis.part', + 'data/chap3/objattr/all-set.dlis.part', + ] + merge_files_oneLR(path, content) + with dlisio.load(path) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + with pytest.raises(RuntimeError) as excinfo: + # note that behavior is different from the one above + # by adding "empty", we process the attribute, hence trigger error + _ = obj['INVALID'] + assert "invalid representation code" in str(excinfo.value) + + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT2', 1, 1) + attr = obj.attic['INVALID'] + assert attr.value == [1, 2, 3, 4] + + _ = obj['INVALID'] + assert_info( + "Problem: Invalid representation code 0\n" + "Where: T.VERY_MUCH_TESTY_SET-I.OBJECT2-O.1-C.1-A.INVALID") -def test_repcode_invalid_in_objects(tmpdir, merge_files_oneLR): +def test_repcode_invalid_in_objects_value(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'invalid-repcode-object.dlis') + + # template for attribute 'DEFAULT_ATTRIBUTE': + # repcode = FDOUBLE # valid + # value = [value1, value2] # valid + + # attribute 'DEFAULT_ATTRIBUTE' in obj 'OBJECT': + # repcode = 32 # invalid + # value = [bytes] # value is declared to be present content = [ 'data/chap3/start.dlis.part', 'data/chap3/template/default.dlis.part', 'data/chap3/object/object.dlis.part', - 'data/chap3/objattr/reprcode-invalid.dlis.part', + 'data/chap3/objattr/reprcode-invalid-value.dlis.part', ] merge_files_oneLR(path, content) @@ -303,6 +396,39 @@ def test_repcode_invalid_in_objects(tmpdir, merge_files_oneLR): f.load() assert "unknown representation code" in str(excinfo.value) +def test_repcode_invalid_in_objects_no_value(tmpdir, merge_files_oneLR, + assert_log): + path = os.path.join(str(tmpdir), 'invalid-repcode-object-no-value.dlis') + + # template for attribute 'GLOBAL_DEFAULT_ATTRIBUTE' is empty + + # attribute 'GLOBAL_DEFAULT_ATTRIBUTE' in obj 'OBJECT': + # repcode = 32 # invalid + # value = None # no value is present + + # attribute 'GLOBAL_DEFAULT_ATTRIBUTE' in obj 'OBJECT2': + # repcode = USHORT # valid + # value = [1, 2, 3, 4] # valid value + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/global-default.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/reprcode-invalid-no-value.dlis.part', + 'data/chap3/object/object2.dlis.part', + 'data/chap3/objattr/all-set.dlis.part', + ] + merge_files_oneLR(path, content) + with dlisio.load(path) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + with pytest.raises(RuntimeError) as excinfo: + _ = obj['GLOBAL_DEFAULT_ATTRIBUTE'] + assert "invalid representation code" in str(excinfo.value) + assert_log("value is not explicitly set") + + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT2', 1, 1) + attr = obj.attic['GLOBAL_DEFAULT_ATTRIBUTE'] + assert attr.value == [1, 2, 3, 4] + def test_repcode_different_no_value(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'different-repcode-no-value.dlis') content = [ From 7da826af70db8f603abb3533fa16904ff7a0de5f Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Thu, 15 Oct 2020 12:30:28 +0200 Subject: [PATCH 08/25] Immediately store procesed data in object set With current approach if error happens during object set processing, we lose all the processed template attributes and objects. Storing them the moment we processed them seems like a better approach. Thus if error happens afterwards we are in better situation with regards to debugging, error recovery and feeding users as much data as possible. --- lib/src/parse.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 1f44709a9..5fc31c67c 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -704,7 +704,6 @@ bool basic_object::operator != (const basic_object& o) const noexcept (true) { const char* object_set::parse_template(const char* cur) noexcept (false) { - object_template tmp; const char* end = this->record.data.data() + this->record.data.size(); while (true) { @@ -713,7 +712,6 @@ const char* object_set::parse_template(const char* cur) noexcept (false) { const auto flags = parse_attribute_descriptor( cur ); if (flags.object) { - swap( tmp, this->tmpl ); return cur; } @@ -746,11 +744,10 @@ const char* object_set::parse_template(const char* cur) noexcept (false) { if (flags.value) cur = elements( cur, attr ); attr.invariant = flags.invariant; - tmp.push_back( std::move( attr ) ); + this->tmpl.push_back( std::move( attr ) ); if (cur == end){ debug_warning("Set contains no objects"); - swap( tmp, this->tmpl ); return cur; } } @@ -904,7 +901,6 @@ noexcept (false) const char* object_set::parse_objects(const char* cur) noexcept (false) { const char* end = this->record.data.data() + this->record.data.size(); - object_vector objs; const auto default_object = defaulted_object( tmpl ); while (cur != end) { @@ -1011,10 +1007,9 @@ const char* object_set::parse_objects(const char* cur) noexcept (false) { current.set(attr); } - objs.push_back( std::move( current ) ); + this->objs.push_back( std::move( current ) ); } - this->objs = objs; return cur; } From 1c34cdd9b3d77076b454a77de84142773360118d Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Thu, 5 Nov 2020 19:06:27 +0100 Subject: [PATCH 09/25] Inform objects about problems in attributes When something unexpected happens in one of the attributes, we want to let users know about it when they access other attributes of the same object. Due to small amount of restrictions in rp66 it can happen that we correctly process already broken attributes and do not notice that they are spoiled. Thus user will trust these attributes when they shouldn't. In theory same can happen for objects (users will believe that object is ok, while in reality it happened to be be processed by accident). This situation is however very unlikely. So in order to not introduce too many log messages, that situation is skipped. As long as errors of at least MINOR level on attributes exist, MINOR message will be logged for corresponding object. --- lib/extension/dlisio/ext/types.hpp | 8 ++++++++ lib/src/parse.cpp | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 72a0ba322..8cdb30a7d 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -524,6 +524,7 @@ struct basic_object { dl::obname object_name; dl::ident type; std::vector< object_attribute > attributes; + std::vector< dl::dlis_error > log; }; /* Object set @@ -555,6 +556,11 @@ struct basic_object { * encrypted records cannot be parsed by dlisio without being decrypted first. * As object_set does its parsing itself, it _will_ fail on construction if * given an encrypted record. + * + * Log: + * + * As well as attributes, every object set should have information about issues + * that arose during parsing. */ using object_vector = std::vector< basic_object >; @@ -566,6 +572,8 @@ struct object_set { dl::ident type; dl::ident name; + std::vector< dl::dlis_error > log; + dl::object_vector& objects() noexcept (false); private: dl::record record; diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 5fc31c67c..3286b28a1 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -896,6 +896,14 @@ noexcept (false) } } +bool is_log_clear( const std::vector< dlis_error >& log ) noexcept (true) { + for (const auto& entry : log) { + if (entry.severity >= dl::error_severity::MINOR) + return false; + } + return true; +} + } const char* object_set::parse_objects(const char* cur) noexcept (false) { @@ -913,6 +921,7 @@ const char* object_set::parse_objects(const char* cur) noexcept (false) { auto current = default_object; current.type = type; if (object_flags.name) cur = cast( cur, current.object_name ); + bool object_clear = true; for (const auto& template_attr : tmpl) { if (template_attr.invariant) continue; @@ -1004,9 +1013,20 @@ const char* object_set::parse_objects(const char* cur) noexcept (false) { patch_missing_value( attr ); } + object_clear = object_clear && is_log_clear(attr.log); + current.set(attr); } + if (not object_clear) { + const auto msg = + "One or more attributes of this object violate specification. " + "This can potentially corrupt the entire object"; + dlis_error err{ + dl::error_severity::MINOR, msg, "", ""}; + current.log.push_back(err); + } + this->objs.push_back( std::move( current ) ); } From 3d963340a3da83ae199d495a80d0ac01b4925443 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Fri, 6 Nov 2020 11:38:42 +0100 Subject: [PATCH 10/25] Move check for broken descriptors to higer level We check for discrepancy with specification the earliest possible. While we can't decide right away if 'attribute' flags are correct as it depends on whether attribute belongs to object or template, we can do so for 'object's and 'set's. By doing so we "fixed" the flags early, thus partly obscuring information. Now all the checks are postponed until before the parsing affected items themselves. No flags are overwritten. Commit doesn't affect existing behavior. --- lib/src/dlisio.cpp | 5 +- lib/src/parse.cpp | 124 +++++++++++++++++++++++------------------- lib/test/protocol.cpp | 8 +-- 3 files changed, 73 insertions(+), 64 deletions(-) diff --git a/lib/src/dlisio.cpp b/lib/src/dlisio.cpp index f4e86f41a..310250338 100644 --- a/lib/src/dlisio.cpp +++ b/lib/src/dlisio.cpp @@ -499,15 +499,14 @@ int dlis_component_set( std::uint8_t desc, int role, int* type, int* name ) { *type = desc & (1 << 4); *name = desc & (1 << 3); - if( !*type ) return DLIS_INCONSISTENT; return DLIS_OK; } int dlis_component_object( std::uint8_t desc, int role, int* obname ) { - if( role != DLIS_ROLE_OBJECT ) return DLIS_UNEXPECTED_VALUE; + if( role != DLIS_ROLE_OBJECT ) + return DLIS_UNEXPECTED_VALUE; *obname = desc & (1 << 4); - if( !*obname ) return DLIS_INCONSISTENT; return DLIS_OK; } diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 3286b28a1..7fd0b3b08 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -33,45 +33,27 @@ set_descriptor parse_set_descriptor( const char* cur ) noexcept (false) { set_descriptor flags; dlis_component( attr, &flags.role ); - switch (flags.role) { - case DLIS_ROLE_RDSET: - case DLIS_ROLE_RSET: - case DLIS_ROLE_SET: - break; - - default: { - const auto bits = std::bitset< 8 >{ attr }.to_string(); - const auto role = dlis_component_str(flags.role); - const auto msg = "error parsing object set descriptor: " - "expected SET, RSET or RDSET, was {} ({})" - ; - throw std::invalid_argument(fmt::format(msg, role, bits)); - } - } - int type, name; const auto err = dlis_component_set( attr, flags.role, &type, &name ); - flags.type = type; - flags.name = name; switch (err) { case DLIS_OK: break; - - case DLIS_INCONSISTENT: - /* - * 3.2.2.2 Component usage - * The Set Component contains the Set Type, which is not optional - * and must not be null, and the Set Name, which is optional. - */ - user_warning( "SET:type not set, but must be non-null." ); - flags.type = true; - break; - + case DLIS_UNEXPECTED_VALUE: { + const auto bits = std::bitset< 8 >{ attr }.to_string(); + const auto role = dlis_component_str(flags.role); + const auto msg = "error parsing object set descriptor: " + "expected SET, RSET or RDSET, was {} ({})" + ; + throw std::invalid_argument(fmt::format(msg, role, bits)); + } default: throw std::runtime_error("unhandled error in dlis_component_set"); } + flags.type = type; + flags.name = name; + return flags; } @@ -107,18 +89,10 @@ attribute_descriptor parse_attribute_descriptor( const char* cur ) { case DLIS_ROLE_INVATR: flags.invariant = true; - - case DLIS_ROLE_ATTRIB: break; - default: { - const auto bits = std::bitset< 8 >(role).to_string(); - const auto was = dlis_component_str(role); - const auto msg = "error parsing attribute descriptor: " - "expected ATTRIB, INVATR, ABSATR or OBJECT, was {} ({})" - ; - throw std::invalid_argument(fmt::format(msg, was, bits)); - } + default: + break; } if (flags.object || flags.absent) return flags; @@ -129,17 +103,30 @@ attribute_descriptor parse_attribute_descriptor( const char* cur ) { &reprc, &units, &value ); + + switch (err) { + case DLIS_OK: + break; + case DLIS_UNEXPECTED_VALUE: { + const auto bits = std::bitset< 8 >(role).to_string(); + const auto was = dlis_component_str(role); + const auto msg = "error parsing attribute descriptor: " + "expected ATTRIB, INVATR, ABSATR or OBJECT, " + "was {} ({})"; + throw std::invalid_argument(fmt::format(msg, was, bits)); + } + default: + throw std::runtime_error( "unhandled error in " + "dlis_component_attrib" ); + } + flags.label = label; flags.count = count; flags.reprc = reprc; flags.units = units; flags.value = value; - if (!err) return flags; - - // all sources for this error should've been checked, so - // something is REALLY wrong if we end up here - throw std::runtime_error( "unhandled error in dlis_component_attrib" ); + return flags; } struct object_descriptor { @@ -153,20 +140,29 @@ object_descriptor parse_object_descriptor( const char* cur ) { int role; dlis_component( attr, &role ); - if (role != DLIS_ROLE_OBJECT) { - const auto bits = std::bitset< 8 >{ attr }.to_string(); - const auto was = dlis_component_str(role); - const auto msg = "error parsing object descriptor: " - "expected OBJECT, was {} ({})" - ; - throw std::invalid_argument(fmt::format(msg, was, bits)); - } - int name; const auto err = dlis_component_object( attr, role, &name ); - if (err) user_warning( "OBJECT:name was not set, but must be non-null" ); - return { true }; + switch (err) { + case DLIS_OK: + break; + case DLIS_UNEXPECTED_VALUE: { + const auto bits = std::bitset< 8 >{ attr }.to_string(); + const auto was = dlis_component_str(role); + const auto msg = "error parsing object descriptor: " + "expected OBJECT, was {} ({})" + ; + throw std::invalid_argument(fmt::format(msg, was, bits)); + } + default: + throw std::runtime_error("unhandled error in " + "dlis_component_object"); + } + + object_descriptor flags; + flags.name = name; + + return flags; } using std::swap; @@ -920,9 +916,14 @@ const char* object_set::parse_objects(const char* cur) noexcept (false) { auto current = default_object; current.type = type; - if (object_flags.name) cur = cast( cur, current.object_name ); bool object_clear = true; + if (object_flags.name) { + user_warning( "OBJECT:name was not set, but must be non-null" ); + } + + cur = cast( cur, current.object_name ); + for (const auto& template_attr : tmpl) { if (template_attr.invariant) continue; if (cur == end) break; @@ -1055,7 +1056,16 @@ const char* object_set::parse_set_component(const char* cur) noexcept (false) { dl::ident tmp_type; dl::ident tmp_name; - if (flags.type) cur = cast( cur, tmp_type ); + if (!flags.type) { + /* + * 3.2.2.2 Component usage + * The Set Component contains the Set Type, which is not optional + * and must not be null, and the Set Name, which is optional. + */ + user_warning( "SET:type not set, but must be non-null." ); + } + + cur = cast( cur, tmp_type ); if (flags.name) cur = cast( cur, tmp_name ); this->type = tmp_type; diff --git a/lib/test/protocol.cpp b/lib/test/protocol.cpp index ca9fda3b3..2de765a3d 100644 --- a/lib/test/protocol.cpp +++ b/lib/test/protocol.cpp @@ -203,21 +203,21 @@ TEST_CASE("Set descriptors", "[component][v1]") { err = dlis_component_set( N, role, &type, &name ); CHECK( !type ); CHECK( name ); - CHECK( err == DLIS_INCONSISTENT ); + CHECK( err == DLIS_OK ); } SECTION("SET: ") { err = dlis_component_set( Z, role, &type, &name ); CHECK( !type ); CHECK( !name ); - CHECK( err == DLIS_INCONSISTENT ); + CHECK( err == DLIS_OK ); } SECTION("SET: reserved values") { err = dlis_component_set( R, DLIS_ROLE_RDSET, &type, &name ); CHECK( !type ); CHECK( !name ); - CHECK( err == DLIS_INCONSISTENT ); + CHECK( err == DLIS_OK ); } } @@ -244,7 +244,7 @@ TEST_CASE("Object descriptors", "[component][v1]") { SECTION("Object: ") { err = dlis_component_object( O, role, &name ); - CHECK( err == DLIS_INCONSISTENT ); + CHECK( err == DLIS_OK ); CHECK( !name ); } From c29f60498d12f973504be0eb82343d52a1c9e90a Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Fri, 6 Nov 2020 11:37:02 +0100 Subject: [PATCH 11/25] Propagate object set warnings to Python --- lib/extension/dlisio/ext/types.hpp | 6 +- lib/src/parse.cpp | 94 ++++++++++++++++++++--------- python/dlisio/ext/core.cpp | 6 +- python/dlisio/file.py | 4 +- python/tests/test_logical_record.py | 22 ++++--- 5 files changed, 89 insertions(+), 43 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 8cdb30a7d..5c5617c9f 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -604,10 +604,12 @@ class pool { object_vector get(const std::string& type, const std::string& name, - const dl::matcher& matcher) noexcept (false); + const dl::matcher& matcher, + const error_handler& errorhandler) noexcept (false); object_vector get(const std::string& type, - const dl::matcher& matcher) noexcept (false); + const dl::matcher& matcher, + const error_handler& errorhandler) noexcept (false); private: std::vector< dl::object_set > eflrs; diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 7fd0b3b08..7138b1e49 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -12,14 +12,6 @@ namespace { -void user_warning( const std::string& ) noexcept (true) { - // TODO: -} - -void debug_warning( const std::string& ) noexcept (true) { - // TODO: -} - struct set_descriptor { int role; bool type; @@ -715,22 +707,30 @@ const char* object_set::parse_template(const char* cur) noexcept (false) { cur += DLIS_DESCRIPTOR_SIZE; if (flags.absent) { - user_warning( "ABSATR in object template - skipping" ); + dlis_error err { + dl::error_severity::MAJOR, + "Absent Attribute in object set template", + "3.2.2.2 Component Usage: A Template consists of a collection " + "of Attribute Components and/or Invariant Attribute " + "Components, mixed in any fashion.", + "Attribute not included in template" + }; + this->log.push_back(err); continue; } object_attribute attr; if (!flags.label) { - /* - * 3.2.2.2 Component usage - * All Components in the Template must have distinct, non-null - * Labels. - * - * Assume that if this isn't set properly it's a corrupted - * descriptor, so just try to read the label anyway - */ - user_warning( "Label not set, but must be non-null" ); + dlis_error err { + dl::error_severity::MAJOR, + "Label not set in template", + "3.2.2.2 Component Usage: All Components in the Template must " + "have distinct, non-null Labels.", + "Assumed attribute descriptor corrupted, attempt to read " + "label anyway" + }; + this->log.push_back(err); } cur = cast( cur, attr.label ); @@ -743,7 +743,13 @@ const char* object_set::parse_template(const char* cur) noexcept (false) { this->tmpl.push_back( std::move( attr ) ); if (cur == end){ - debug_warning("Set contains no objects"); + dlis_error err { + dl::error_severity::INFO, + "Set contains no objects", + "3.2.2.2 Component Usage: A Set consists of one or more Objects", + "Leave the set empty and return" + }; + this->log.push_back(err); return cur; } } @@ -900,6 +906,19 @@ bool is_log_clear( const std::vector< dlis_error >& log ) noexcept (true) { return true; } +void report_set_errors(const dl::object_set& eflr, + const dl::error_handler& errorhandler) { + if (eflr.log.size()) { + const auto context = "object set of type '" + + dl::decay(eflr.type) + "' named '" + + dl::decay(eflr.name) +"'"; + for (const auto &err : eflr.log) { + errorhandler.log(err.severity, context, err.problem, + err.specification, err.action); + } + } +} + } const char* object_set::parse_objects(const char* cur) noexcept (false) { @@ -918,8 +937,16 @@ const char* object_set::parse_objects(const char* cur) noexcept (false) { current.type = type; bool object_clear = true; - if (object_flags.name) { - user_warning( "OBJECT:name was not set, but must be non-null" ); + if (!object_flags.name) { + dlis_error err { + dl::error_severity::MAJOR, + "OBJECT:name was not set", + "3.2.2.1 Component Descriptor: That is, every Object has " + "a non-null Name", + "Assumed object descriptor corrupted, attempt to read name " + "anyway" + }; + current.log.push_back(err); } cur = cast( cur, current.object_name ); @@ -1057,12 +1084,15 @@ const char* object_set::parse_set_component(const char* cur) noexcept (false) { dl::ident tmp_name; if (!flags.type) { - /* - * 3.2.2.2 Component usage - * The Set Component contains the Set Type, which is not optional - * and must not be null, and the Set Name, which is optional. - */ - user_warning( "SET:type not set, but must be non-null." ); + dlis_error err { + dl::error_severity::MAJOR, + "SET:type not set", + "3.2.2.1 Component Descriptor: A Set’s Type Characteristic must " + "be non-null and must always be explicitly present in " + "the Set Component", + "Assumed set descriptor corrupted, attempt to read type anyway" + }; + this->log.push_back(err); } cur = cast( cur, tmp_type ); @@ -1106,7 +1136,8 @@ std::vector< dl::ident > pool::types() const noexcept (true) { object_vector pool::get(const std::string& type, const std::string& name, - const dl::matcher& m) + const dl::matcher& m, + const error_handler& errorhandler) noexcept (false) { object_vector objs; @@ -1118,12 +1149,15 @@ noexcept (false) { objs.push_back(obj); } + + report_set_errors (eflr, errorhandler); } return objs; } object_vector pool::get(const std::string& type, - const dl::matcher& m) + const dl::matcher& m, + const error_handler& errorhandler) noexcept (false) { object_vector objs; @@ -1132,6 +1166,8 @@ noexcept (false) { auto tmp = eflr.objects(); objs.insert(objs.end(), tmp.begin(), tmp.end()); + + report_set_errors (eflr, errorhandler); } return objs; } diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 469d22abc..bc7ce97da 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -916,11 +916,13 @@ PYBIND11_MODULE(core, m) { .def( "get", (dl::object_vector (dl::pool::*) ( const std::string&, const std::string&, - const dl::matcher& + const dl::matcher&, + const dl::error_handler& )) &dl::pool::get ) .def( "get", (dl::object_vector (dl::pool::*) ( const std::string&, - const dl::matcher& + const dl::matcher&, + const dl::error_handler& )) &dl::pool::get ) ; diff --git a/python/dlisio/file.py b/python/dlisio/file.py index de0300d41..89710594c 100644 --- a/python/dlisio/file.py +++ b/python/dlisio/file.py @@ -394,9 +394,9 @@ def find(self, objecttype, objectname=None, matcher=None): matcher = settings.regex if not objectname: - attics = self.object_pool.get(objecttype, matcher) + attics = self.object_pool.get(objecttype, matcher, self.error_handler) else: - attics = self.object_pool.get(objecttype, objectname, matcher) + attics = self.object_pool.get(objecttype, objectname, matcher, self.error_handler) objects = [] for attic in attics: diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index e3efe8fb0..311a24785 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -100,8 +100,7 @@ def test_absent_attribute(tmpdir, merge_files_oneLR): _ = obj.attic['DEFAULT_ATTRIBUTE'] -@pytest.mark.future_warning_absent_attr_in_template -def test_absent_attribute_in_template(tmpdir, merge_files_oneLR): +def test_absent_attribute_in_template(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'absent-attribute-in-template.dlis') content = [ 'data/chap3/start.dlis.part', @@ -114,6 +113,7 @@ def test_absent_attribute_in_template(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *tail): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'].value + assert_log("Absent Attribute in object set template") @pytest.mark.future_test_attributes def test_global_default_attribute(tmpdir, merge_files_oneLR): @@ -301,6 +301,7 @@ def test_repcode_invalid_in_template_no_value_fixed(tmpdir, merge_files_oneLR, with dlisio.load(path) as (f, *_): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + attr = obj.attic['INVALID'] assert attr.value == [1, 2, 3, 4] @@ -429,7 +430,8 @@ def test_repcode_invalid_in_objects_no_value(tmpdir, merge_files_oneLR, attr = obj.attic['GLOBAL_DEFAULT_ATTRIBUTE'] assert attr.value == [1, 2, 3, 4] -def test_repcode_different_no_value(tmpdir, merge_files_oneLR, assert_log): +def test_repcode_different_no_value(tmpdir, merge_files_oneLR, assert_log, + assert_info): path = os.path.join(str(tmpdir), 'different-repcode-no-value.dlis') content = [ 'data/chap3/start.dlis.part', @@ -518,9 +520,8 @@ def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR, assert_log): _ = obj['DEFAULT_ATTRIBUTE'] assert_log("Label bit") -@pytest.mark.future_warning_label_bit_not_set_in_template @pytest.mark.not_implemented_datetime_timezone -def test_label_bit_not_set_in_template(tmpdir, merge_files_oneLR): +def test_label_bit_not_set_in_template(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'label-bit-not-set-in-template.dlis') content = [ 'data/chap3/start.dlis.part', @@ -534,10 +535,10 @@ def test_label_bit_not_set_in_template(tmpdir, merge_files_oneLR): attr = obj.attic['NEW_ATTRIBUTE'] dt = datetime(2033, 4, 19, 20, 39, 58, 103000) assert attr.value == [dt] + assert_log("Label not set in template") -@pytest.mark.future_warning_set_type_bit_not_set -def test_set_type_bit_not_set_in_set(tmpdir, merge_files_oneLR): +def test_set_type_bit_not_set_in_set(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'set-type-not-set.dlis') content = [ 'data/chap3/sul.dlis.part', @@ -550,6 +551,9 @@ def test_set_type_bit_not_set_in_set(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *tail): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'].value + assert_log("Problem: SET:type not set\n" + "Where: object set of type 'VERY_MUCH_TESTY_SET' " + "named ''") @pytest.mark.future_warning_object_name_bit_not_set @@ -756,7 +760,8 @@ def test_cut_before_template(tmpdir, merge_files_oneLR): assert "unexpected end-of-record" in str(excinfo.value) -def test_cut_before_object(tmpdir, merge_files_oneLR): +def test_cut_before_object(tmpdir, merge_files_oneLR, assert_debug): + # otherwise known as "set with no objects" test path = os.path.join(str(tmpdir), 'cut-before-object.dlis') content = [ 'data/chap3/start.dlis.part', @@ -766,6 +771,7 @@ def test_cut_before_object(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f,): objects = f.find('.*', '.*') assert len(objects) == 0 + assert_debug("Set contains no objects") @pytest.mark.skip(reason="result inconsistent") From 0bdc3cab07782c1d437abb733fc6ac7b2bc34147 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Fri, 6 Nov 2020 11:55:07 +0100 Subject: [PATCH 12/25] Propagate object warnings to Python --- python/dlisio/ext/core.cpp | 1 + python/dlisio/plumbing/basicobject.py | 11 +++++++++++ python/tests/test_logical_record.py | 17 ++++++++++++++--- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index bc7ce97da..62525f5ba 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -882,6 +882,7 @@ PYBIND11_MODULE(core, m) { py::class_< dl::basic_object >( m, "basic_object" ) .def_readonly("type", &dl::basic_object::type) .def_readonly("name", &dl::basic_object::object_name) + .def_readonly("log", &dl::basic_object::log) .def( "__len__", []( const dl::basic_object& o ) { return o.attributes.size(); }) diff --git a/python/dlisio/plumbing/basicobject.py b/python/dlisio/plumbing/basicobject.py index d45eddc58..4bef50b1b 100644 --- a/python/dlisio/plumbing/basicobject.py +++ b/python/dlisio/plumbing/basicobject.py @@ -196,6 +196,17 @@ def __getitem__(self, key): # No rule for parsing, keep rp66value as is, i.e. vector parse_as = vector + # report errors before checking for key presence - it might be a symptom + if len(self.attic.log) > 0: + # TODO: here and for attribute: we use fingerprint to report + # context, not repr, to have origina and copynr, but is fingerprint + # clear enough for the user? + context = "{}".format(self.fingerprint) + for error in self.attic.log: + self.logicalfile.error_handler.log( + error.severity, context, error.problem, + error.specification, error.action) + try: attribute = self.attic[key] except KeyError: diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index 311a24785..2d0e55eaa 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -294,6 +294,7 @@ def test_repcode_invalid_in_template_no_value_fixed(tmpdir, merge_files_oneLR, content = [ 'data/chap3/start.dlis.part', 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/template/default.dlis.part', 'data/chap3/object/object.dlis.part', 'data/chap3/objattr/all-set.dlis.part', ] @@ -302,11 +303,18 @@ def test_repcode_invalid_in_template_no_value_fixed(tmpdir, merge_files_oneLR, with dlisio.load(path) as (f, *_): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + _ = obj['DEFAULT_ATTRIBUTE'] + assert_info("Problem: One or more attributes " + "of this object violate specification") + assert_info("Where: T.VERY_MUCH_TESTY_SET-I.OBJECT-O.1-C.1") + attr = obj.attic['INVALID'] assert attr.value == [1, 2, 3, 4] _ = obj['INVALID'] - assert_info("Invalid representation code 0") + assert_info("Problem: Invalid representation code 0\n" + "Where: " + "T.VERY_MUCH_TESTY_SET-I.OBJECT-O.1-C.1-A.INVALID") def test_repcode_invalid_in_template_no_value_not_fixed(tmpdir, merge_files_oneLR, @@ -556,8 +564,8 @@ def test_set_type_bit_not_set_in_set(tmpdir, merge_files_oneLR, assert_log): "named ''") -@pytest.mark.future_warning_object_name_bit_not_set -def test_object_name_bit_not_set_in_object(tmpdir, merge_files_oneLR): +def test_object_name_bit_not_set_in_object(tmpdir, merge_files_oneLR, + assert_log): path = os.path.join(str(tmpdir), 'no-object-name-bit.dlis') content = [ 'data/chap3/start.dlis.part', @@ -569,6 +577,9 @@ def test_object_name_bit_not_set_in_object(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *tail): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'].value + _ = obj['DEFAULT_ATTRIBUTE'] + assert_log("Problem: OBJECT:name was not set\n" + "Where: T.VERY_MUCH_TESTY_SET-I.OBJECT-O.1-C.1") @pytest.mark.future_test_attributes From 223c54337df888e9d5b0658e3fb0a1eca648008a Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Thu, 17 Sep 2020 18:02:43 +0200 Subject: [PATCH 13/25] Inform users about redundant and replacement sets While we do not support neither redundant nor replacement sets, we need to inform users if they ever appear. As of now we haven't seen them used in real files. Both can cause certain problems for the users, but redundant data might be handled fairly well, hence we inform users through minor error. Replacement sets might be a bigger problem for the user, hence major error is reported. --- lib/src/parse.cpp | 30 ++++++++++++++++++++ python/data/chap3/set/redundant.dlis.part | Bin 0 -> 19 bytes python/data/chap3/set/replacement.dlis.part | Bin 0 -> 21 bytes python/tests/test_logical_record.py | 30 ++++++++++++++++++++ 4 files changed, 60 insertions(+) create mode 100644 python/data/chap3/set/redundant.dlis.part create mode 100644 python/data/chap3/set/replacement.dlis.part diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 7138b1e49..5f6cec809 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -1075,6 +1075,36 @@ const char* object_set::parse_set_component(const char* cur) noexcept (false) { throw std::out_of_range( msg ); } + switch (flags.role) { + case DLIS_ROLE_RDSET: { + dlis_error err { + dl::error_severity::MINOR, + "Redundant sets are not supported by dlisio", + "3.2.2.2 Component Usage: A Redundant Set is an identical copy " + "of some Set written previously in the same Logical File", + "Redundant set is treated as a normal set, which might lead " + "to issues with duplicated objects" + }; + this->log.push_back(err); + break; + } + case DLIS_ROLE_RSET: { + dlis_error err { + dl::error_severity::MAJOR, + "Replacement sets are not supported by dlisio", + "3.2.2.2 Component Usage: Attributes of the Replacement Set " + "reflect all updates that may have been applied since the " + "original Set was written", + "Replacement set is treated as a normal set, which might lead " + "to issues with duplicated objects and invalid information" + }; + this->log.push_back(err); + break; + } + default: + break; + } + /* * TODO: check for every read that inside [begin,end)? */ diff --git a/python/data/chap3/set/redundant.dlis.part b/python/data/chap3/set/redundant.dlis.part new file mode 100644 index 0000000000000000000000000000000000000000..4e881f166f7dc0fb203f5104b63d25bc3cfab86f GIT binary patch literal 19 acmZQr^`DVpRfE6=&LCHpP(K$(zYqXIsRly; literal 0 HcmV?d00001 diff --git a/python/data/chap3/set/replacement.dlis.part b/python/data/chap3/set/replacement.dlis.part new file mode 100644 index 0000000000000000000000000000000000000000..7b0a210f7f1dcd6fd4b1309e065f6d7e8c33e93c GIT binary patch literal 21 ccmZQr^`DVpRfE6 Date: Wed, 28 Oct 2020 11:09:41 +0100 Subject: [PATCH 14/25] Add error handler tests We detect various specification violation. Some of them we consider insignificant enough and recover. Out users can, however, disagree with us about importance of certain specification violations. Hence we give users a little bit more control over it. For example, users want to receive an error every time something is spoiled even a little bit. Or on the opposite - users may want to get as much data as possible, accepting the danger that data might be spoiled. --- python/tests/test_error_handler.py | 77 ++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 python/tests/test_error_handler.py diff --git a/python/tests/test_error_handler.py b/python/tests/test_error_handler.py new file mode 100644 index 000000000..3bac0dd3f --- /dev/null +++ b/python/tests/test_error_handler.py @@ -0,0 +1,77 @@ +""" +Testing error handler on various levels of representation +(user-facade functionality and underlying logical format) +""" + +import pytest +import os + +import dlisio + +from dlisio.errors import ErrorHandler, Actions + +errorhandler = ErrorHandler(critical = Actions.LOG_ERROR) + +# TODO: test_errors_explicit (let users examine the errors regardless of escape +# level set) + +# TODO: test mix (do not fail on truncation, but fail on bad attribute access) + +# TODO: test_truncated + +# TODO: test_extract + +# TODO: test_parse_exeptions + +def test_parse_critical_escaped(tmpdir, merge_files_oneLR, + assert_error): + path = os.path.join(str(tmpdir), 'error-on-attribute-access.dlis') + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/empty.dlis.part', + ] + merge_files_oneLR(path, content) + + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + # value is unclear and shouldn't be trusted + _ = obj['INVALID'] + assert_error("invalid representation code") + +def test_parse_major_errored(tmpdir, merge_files_oneLR): + path = os.path.join(str(tmpdir), 'replacement-set.dlis') + content = [ + 'data/chap3/sul.dlis.part', + 'data/chap3/set/replacement.dlis.part', + 'data/chap3/template/default.dlis.part', + 'data/chap3/object/object.dlis.part' + ] + merge_files_oneLR(path, content) + + errorhandler = ErrorHandler( + major=Actions.RAISE) + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + with pytest.raises(RuntimeError) as excinfo: + _ = f.object('REPLACEMENT', 'OBJECT', 1, 1) + assert "Replacement sets are not supported" in str(excinfo.value) + +def test_parse_minor_errored(tmpdir, merge_files_oneLR): + path = os.path.join(str(tmpdir), 'redundant-set.dlis') + content = [ + 'data/chap3/sul.dlis.part', + 'data/chap3/set/redundant.dlis.part', + 'data/chap3/template/default.dlis.part', + 'data/chap3/object/object.dlis.part' + ] + merge_files_oneLR(path, content) + + errorhandler = ErrorHandler( + major=Actions.RAISE, + minor=Actions.RAISE + ) + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + with pytest.raises(RuntimeError) as excinfo: + _ = f.object('REDUNDANT', 'OBJECT', 1, 1) + assert "Redundant sets are not supported" in str(excinfo.value) From 23b88a0549501b615e9ea1b0133e491f4ae53943 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Wed, 18 Nov 2020 15:22:27 +0100 Subject: [PATCH 15/25] Allow users to read trucated and zeroed files here are many truncated (or zeroed out) files out there. Right now we do not give users even a chance to read such files as they fail right on the load. However problem might appear only after hundreds of megabytes of correct data. Thus this commit aims to allow users to read as much as possible data from such files. By using error escape setting it's up to users to decide if they want to process broken files or not. While dlisio doesn't attempt to recover and find out if there is anything useful beyond error offset, already processed data will be returned. Note that we want to catch and handle any wrong situation that can happen in dlisio files (even though we do not know about its existence yet). Current implementation (here and in further commits) is a result of fierce debate about the best approach on how to tackle the issue. --- lib/extension/dlisio/ext/io.hpp | 3 +- lib/src/io.cpp | 156 ++++++++++++++++-------- python/data/chap2/README.rst | 4 +- python/data/chap2/zeroed-in-1st-lr.dlis | Bin 304 -> 608 bytes python/dlisio/ext/core.cpp | 7 +- python/dlisio/load.py | 11 +- python/tests/test_error_handler.py | 47 ++++++- python/tests/test_loading.py | 10 ++ python/tests/test_physical_layout.py | 12 +- python/tests/test_tif.py | 2 +- 10 files changed, 187 insertions(+), 65 deletions(-) diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index c919169f6..0825b3139 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -58,6 +58,7 @@ class stream { struct stream_offsets { std::vector< long long > explicits; std::vector< long long > implicits; + std::vector< long long > broken; }; stream open(const std::string&, std::int64_t) noexcept (false); @@ -71,7 +72,7 @@ bool hastapemark(stream&) noexcept (false); dl::record extract(stream&, long long) noexcept (false); dl::record& extract(stream&, long long, long long, dl::record&) noexcept (false); -stream_offsets findoffsets(dl::stream&) noexcept (false); +stream_offsets findoffsets(dl::stream&, dl::error_handler&) noexcept (false); std::map< dl::ident, std::vector< long long > > findfdata(dl::stream&, const std::vector< long long >&) noexcept (false); diff --git a/lib/src/io.cpp b/lib/src/io.cpp index 0870106fc..650b33e46 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -210,23 +210,6 @@ noexcept (false) { } } -void check_for_truncation(dl::stream& file, std::int64_t offset) { - file.seek(offset - 1); - char tmp; - /* - * lfp returns UNEXPECTED_EOF for cfile when truncation happens - * inside of declared data - * TODO: if dlisio will support other types of io, this might have - * to be reconsidered - */ - try { - file.read(&tmp, 1); - } catch (const std::runtime_error& e) { - throw std::runtime_error("findoffsets: file truncated"); - } - return; -} - } stream::stream( lfp_protocol* f ) noexcept (false){ @@ -405,69 +388,142 @@ record& extract(stream& file, long long tell, long long bytes, record& rec) noex } } -stream_offsets findoffsets( dl::stream& file) noexcept (false) { +stream_offsets findoffsets( dl::stream& file, dl::error_handler& errorhandler) +noexcept (false) { stream_offsets ofs; - std::int64_t offset = 0; + std::int64_t lr_offset = 0; + std::int64_t lrs_offset = 0; + bool has_successor = false; char buffer[ DLIS_LRSH_SIZE ]; + const auto handle = [&]( const std::string& problem ) { + const auto context = "dl::findoffsets (indexing logical file)"; + errorhandler.log(dl::error_severity::CRITICAL, context, problem, "", + "Indexing is suspended at last valid Logical Record"); + ofs.broken.push_back( lr_offset ); + }; + int len = 0; + auto read = 0; + + file.seek(lrs_offset); + while (true) { - file.seek(offset); - file.read(buffer, DLIS_LRSH_SIZE); + try { + read = file.read(buffer, DLIS_LRSH_SIZE); + } catch (std::exception& e) { + handle(e.what()); + break; + } + + // read operation is enough to set eof correctly if (file.eof()) { - check_for_truncation(file, offset); - if (has_successor) { - auto msg = "File is over, but last logical record segment " - "expects successor"; - throw std::runtime_error(msg); + if (read == 0) { + if (has_successor) { + const auto problem = + "Reached EOF, but last logical record segment expects " + "successor"; + handle(problem); + } + break; } - break; + if (read < 4) { + /* + * Very unlikely situation. + * Usually exception will be thrown during read. + */ + const auto problem = + "File truncated in Logical Record Header"; + handle(problem); + break; + } + /* + * Do nothing if read == 4 + * This might be the problem for next Logical File. + * If not, it will be dealt with later + */ } int type; std::uint8_t attrs; dlis_lrsh( buffer, &len, &attrs, &type ); if (len < 4) { - auto msg = "Too short logical record. Length can't be less than 4, " - "but was {}"; - throw std::runtime_error(fmt::format(msg, len)); + const auto problem = + "Too short logical record. Length can't be less than 4, " + "but was {}"; + handle(fmt::format(problem, len)); + break; } - int isexplicit = attrs & DLIS_SEGATTR_EXFMTLR; - if (not (attrs & DLIS_SEGATTR_PREDSEG)) { + bool isexplicit = attrs & DLIS_SEGATTR_EXFMTLR; + bool has_predecessor = attrs & DLIS_SEGATTR_PREDSEG; + + if (not (has_predecessor)) { if (isexplicit and type == 0 and ofs.explicits.size()) { /* - * Wrap up when we encounter a EFLR of type FILE-HEADER that is - * NOT the first Logical Record. More precisely we expect the - * _first_ LR we encounter to be a FILE-HEADER. We gather up - * this LR and all successive LR's until we encounter another - * FILE-HEADER. + * Wrap up when we encounter a EFLR of type FILE-HEADER that + * is NOT the first Logical Record. More precisely we expect + * the _first_ LR we encounter to be a FILE-HEADER. We + * gather up this LR and all successive LR's until we + * encounter another FILE-HEADER. */ - check_for_truncation(file, offset); if (has_successor) { - auto msg = "New logical file appears, but previous logical " - "record segment expects successor"; - throw std::runtime_error(msg); + const auto problem = + "End of logical file, but last logical " + "record segment expects successor"; + handle(problem); + break; } - file.seek( offset ); + // seek to assure handle is in the right place to read next LF + file.seek( lrs_offset ); break; } - if (isexplicit) ofs.explicits.push_back( offset ); + } + + has_successor = attrs & DLIS_SEGATTR_SUCCSEG; + lrs_offset += len; + + /* + * Skip the segment by moving the cursor to the next offset. + * Seek operation alone isn't enough to correctly set EOF. To make sure + * record is not truncated, read its last byte instead of seeking + * to the new offset. + * + * Note that lfp returns UNEXPECTED_EOF for cfile when truncation + * happens inside of declared data + * TODO: assure behavior for other io types + */ + + char tmp; + file.seek(lrs_offset - 1); + try { + file.read(&tmp, 1); + } catch (const std::runtime_error& e) { + const auto problem = "File truncated in Logical Record Segment"; + handle(problem); + break; + } + + + if (not (has_successor)) { + if (isexplicit) + ofs.explicits.push_back( lr_offset ); /* * Consider doing fdata-indexing on the fly as we are now at the - * correct offset to read the OBNAME. That would mean we only need - * to traverse the file a single time to index it. Additionally it - * would make the caller code from python way nicer. + * correct offset to read the OBNAME. That would mean we only + * need to traverse the file a single time to index it. + * Additionally it would make the caller code from python way + * nicer. */ - else ofs.implicits.push_back( offset ); - } + else + ofs.implicits.push_back( lr_offset ); - has_successor = attrs & DLIS_SEGATTR_SUCCSEG; - offset += len; + lr_offset = lrs_offset; + } } return ofs; } diff --git a/python/data/chap2/README.rst b/python/data/chap2/README.rst index 95dd457f5..37e2c3171 100644 --- a/python/data/chap2/README.rst +++ b/python/data/chap2/README.rst @@ -95,7 +95,9 @@ wrong-lrhs.dlis Mismatch between logical record segment length) zeroed-in-1st-lr.dlis VR is expected to contain over 1 LR. Data is - zeroed from middle of 1st LR + zeroed from middle of 1st LR. There are 512 + zeroes (more than default 200 bytes + findvrl value) zeroed-in-2nd-lr.dlis VR contains 2 LRs. Data is zeroed in the middle of 2nd LR diff --git a/python/data/chap2/zeroed-in-1st-lr.dlis b/python/data/chap2/zeroed-in-1st-lr.dlis index d403f48f144b22612991bfa6a9aae380187add64..8add87bdf87d9c10f2f219f11a47c244c21af756 100644 GIT binary patch delta 13 UcmdnM^nhi90prAi0}~4h046O3ssI20 delta 7 OcmaFBvVm!X0V4nmC<1r@ diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 62525f5ba..cddb3c2f1 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -1023,9 +1023,10 @@ PYBIND11_MODULE(core, m) { m.def( "hastapemark", dl::hastapemark ); m.def("findfdata", dl::findfdata); - m.def( "findoffsets", []( dl::stream& file ) { - const auto ofs = dl::findoffsets( file ); - return py::make_tuple( ofs.explicits, ofs.implicits ); + m.def( "findoffsets", []( dl::stream& file, + dl::error_handler& errorhandler) { + const auto ofs = dl::findoffsets( file, errorhandler ); + return py::make_tuple( ofs.explicits, ofs.implicits, ofs.broken ); }); py::enum_< dl::error_severity >( m, "error_severity" ) diff --git a/python/dlisio/load.py b/python/dlisio/load.py index 22e6fcba6..9a9921703 100644 --- a/python/dlisio/load.py +++ b/python/dlisio/load.py @@ -126,7 +126,7 @@ def rewind(offset, tif): if tapemarks: stream = core.open_tif(stream) stream = core.open_rp66(stream) - explicits, implicits = core.findoffsets(stream) + explicits, implicits, broken = core.findoffsets(stream, error_handler) hint = rewind(stream.absolute_tell, tapemarks) recs = core.extract(stream, explicits) @@ -137,8 +137,15 @@ def rewind(offset, tif): lf = logicalfile(stream, pool, fdata, sul, error_handler) lfs.append(lf) + if len(broken): + # do not attempt to recover or read more logical files + # if the error happened in findoffsets + # return all logical files we were able to process until now + break + + stream = core.open(path) + try: - stream = core.open(path) offset = core.findvrl(stream, hint) except RuntimeError: if stream.eof(): diff --git a/python/tests/test_error_handler.py b/python/tests/test_error_handler.py index 3bac0dd3f..292810ca3 100644 --- a/python/tests/test_error_handler.py +++ b/python/tests/test_error_handler.py @@ -17,7 +17,52 @@ # TODO: test mix (do not fail on truncation, but fail on bad attribute access) -# TODO: test_truncated +def test_unescapable_notdlis(assert_error): + path = 'data/chap2/nondlis.txt' + with pytest.raises(RuntimeError) as excinfo: + _ = dlisio.load(path, error_handler=errorhandler) + assert "could not find visible record envelope" in str(excinfo.value) + +def test_truncated_in_data(assert_error): + path = 'data/chap2/truncated-in-second-lr.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("File truncated in Logical Record Segment") + assert len(f.channels) == 1 + +def test_tif_truncated_in_data(assert_error): + path = 'data/tif/layout/truncated-in-data.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("File truncated in Logical Record Segment") + +def test_truncated_in_lrsh(assert_error): + path = 'data/chap2/truncated-in-lrsh.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("unexpected EOF when reading record") + assert len(f.channels) == 1 + +def test_truncated_after_lrsh_new_lf(assert_error): + path = 'data/chap2/lf-truncated-after-lrsh.dlis' + with dlisio.load(path, error_handler=errorhandler) as batch: + assert_error("File truncated in Logical Record Segment") + assert len(batch) == 2 + +def test_truncated_lr_missing_lrs_vr_over(assert_error): + path = 'data/chap2/truncated-lr-no-lrs-vr-over.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("last logical record segment expects successor") + assert len(f.channels) == 0 + +def test_zeroed_before_lrs(assert_error): + path = 'data/chap2/zeroed-in-1st-lr.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("Too short logical record") + assert len(f.channels) == 0 + +def test_zeroed_before_vr(assert_error): + path = 'data/chap2/zeroed-in-2nd-lr.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("Incorrect format version") + assert len(f.channels) == 1 # TODO: test_extract diff --git a/python/tests/test_loading.py b/python/tests/test_loading.py index e54acdcfb..35cf71a2c 100644 --- a/python/tests/test_loading.py +++ b/python/tests/test_loading.py @@ -33,14 +33,24 @@ def test_filehandles_closed(tmpdir): many_logical = str(tmpdir.join('many_logical')) shutil.copyfile('data/chap4-7/many-logical-files.dlis', many_logical) + offsets_error_escape = str(tmpdir.join('error_escape_zeroed')) + shutil.copyfile('data/chap2/zeroed-in-1st-lr.dlis', offsets_error_escape) + with dlisio.load(tmp) as _: pass with dlisio.load(many_logical) as fls: assert len(fls) == 3 + # error happens in 1st LF, but is escaped + errorhandler = dlisio.errors.ErrorHandler( + critical=dlisio.errors.Actions.LOG_ERROR) + with dlisio.load(offsets_error_escape, error_handler=errorhandler): + pass + os.remove(tmp) os.remove(many_logical) + os.remove(offsets_error_escape) def test_filehandles_closed_when_load_fails(tmpdir): # Check that we don't leak open filehandles on failure diff --git a/python/tests/test_physical_layout.py b/python/tests/test_physical_layout.py index 6c2ecb15f..e3fce76ef 100644 --- a/python/tests/test_physical_layout.py +++ b/python/tests/test_physical_layout.py @@ -45,7 +45,7 @@ def test_lrs_atributes_inconsistency(): def test_logical_file_lrs_inconsistency(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/lf-lrs-inconsistency.dlis') - msg = "file appears, but previous logical record segment expects successor" + msg = "logical file, but last logical record segment expects successor" assert msg in str(excinfo.value) def test_padbytes_as_large_as_record(): @@ -100,12 +100,12 @@ def test_broken_vr(): def test_truncated_in_iflr(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-in-iflr.dlis') - assert "findoffsets: file truncated" in str(excinfo.value) + assert "File truncated in Logical Record Segment" in str(excinfo.value) def test_truncated_in_second_lr(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-in-second-lr.dlis') - assert "findoffsets: file truncated" in str(excinfo.value) + assert "File truncated in Logical Record Segment" in str(excinfo.value) def test_truncated_in_lrsh(): with pytest.raises(RuntimeError) as excinfo: @@ -113,14 +113,14 @@ def test_truncated_in_lrsh(): assert "unexpected EOF when reading record" in str(excinfo.value) def test_truncated_in_lrsh_vr_over(): - with pytest.raises(ValueError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-in-lrsh-vr-over.dlis') - assert "error parsing object set descriptor" in str(excinfo.value) + assert "File truncated in Logical Record Header" in str(excinfo.value) def test_truncated_after_lrsh(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-after-lrsh.dlis') - assert "findoffsets: file truncated" in str(excinfo.value) + assert "File truncated in Logical Record Segment" in str(excinfo.value) def test_truncated_lr_missing_lrs_in_vr(): with pytest.raises(RuntimeError) as excinfo: diff --git a/python/tests/test_tif.py b/python/tests/test_tif.py index 08006712b..899905028 100644 --- a/python/tests/test_tif.py +++ b/python/tests/test_tif.py @@ -96,7 +96,7 @@ def test_layout_truncated_in_data(): fpath = 'data/tif/layout/truncated-in-data.dlis' with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load(fpath) - assert "findoffsets: file truncated" in str(excinfo.value) + assert "File truncated in Logical Record Segment" in str(excinfo.value) def test_layout_truncated_in_header(): fpath = 'data/tif/layout/truncated-in-header.dlis' From 8ccb9d0951702f6675b6532bb6c107ead8fa06cd Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Mon, 21 Sep 2020 16:57:19 +0200 Subject: [PATCH 16/25] Add error handler to extract method Not many files can fail inside extract method because structure is processed in findoffsets and data is postponed until parse, but it's still possible. Thus making sure that failed records won't be processed, but records after bad one will be. --- python/data/chap2/README.rst | 3 ++- python/data/chap2/padbytes-bad.dlis | Bin 264 -> 312 bytes python/dlisio/ext/core.cpp | 14 ++++++++++++-- python/dlisio/load.py | 2 +- python/tests/test_error_handler.py | 7 ++++++- python/tests/test_helpers.py | 2 +- 6 files changed, 22 insertions(+), 6 deletions(-) diff --git a/python/data/chap2/README.rst b/python/data/chap2/README.rst index 37e2c3171..a030c8f74 100644 --- a/python/data/chap2/README.rst +++ b/python/data/chap2/README.rst @@ -43,7 +43,8 @@ old-vr.dlis Older visible record, see 2.3.6.2 Format Version of rp66v1 padbytes-bad.dlis Mismatch between padbyte length and logical - record length + record length. Second LR in the file is + correct. padbytes-encrypted.dlis Explicitly formatted logical record with encrypted padbytes diff --git a/python/data/chap2/padbytes-bad.dlis b/python/data/chap2/padbytes-bad.dlis index 17d370099041efccca89f3dc3c39938920da0fa0..1908537566ef52d8843edf4824020dfb804afc9f 100644 GIT binary patch delta 60 zcmeBR+QBp-knzRDplt>W1`Qb>IKv!$JY95yT|*350~~`KeG9l4pkn?`UW(jcF& tells) { + const std::vector< long long >& tells, + dl::error_handler& errorhandler) { std::vector< dl::record > recs; recs.reserve( tells.size() ); for (auto tell : tells) { - auto rec = dl::extract(s, tell); + dl::record rec; + try { + rec = dl::extract(s, tell); + } catch (const std::exception& e) { + const auto context = + "dl::extract: Reading raw bytes from record"; + errorhandler.log(dl::error_severity::CRITICAL, context, + e.what(), "", "Record is skipped"); + continue; + } if (rec.data.size() > 0) { recs.push_back( std::move( rec ) ); } diff --git a/python/dlisio/load.py b/python/dlisio/load.py index 9a9921703..5dfbee3b9 100644 --- a/python/dlisio/load.py +++ b/python/dlisio/load.py @@ -129,7 +129,7 @@ def rewind(offset, tif): explicits, implicits, broken = core.findoffsets(stream, error_handler) hint = rewind(stream.absolute_tell, tapemarks) - recs = core.extract(stream, explicits) + recs = core.extract(stream, explicits, error_handler) sets = core.parse_objects(recs) pool = core.pool(sets) fdata = core.findfdata(stream, implicits) diff --git a/python/tests/test_error_handler.py b/python/tests/test_error_handler.py index 292810ca3..ee8c672c1 100644 --- a/python/tests/test_error_handler.py +++ b/python/tests/test_error_handler.py @@ -64,7 +64,12 @@ def test_zeroed_before_vr(assert_error): assert_error("Incorrect format version") assert len(f.channels) == 1 -# TODO: test_extract +def test_extract_broken_padbytes(assert_error): + path = 'data/chap2/padbytes-bad.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("bad segment trim") + valid_obj = f.object("VALID-SET", "VALID-OBJ", 10, 0) + assert valid_obj # TODO: test_parse_exeptions diff --git a/python/tests/test_helpers.py b/python/tests/test_helpers.py index 9b3c9a87b..4f9c97e44 100644 --- a/python/tests/test_helpers.py +++ b/python/tests/test_helpers.py @@ -56,7 +56,7 @@ def test_record_attributes(): stream = core.open('data/chap2/3lr-in-vr-one-encrypted.dlis', zero=80) stream = core.open_rp66(stream) explicits = [0, 32, 64] - recs = core.extract(stream, explicits) + recs = core.extract(stream, explicits, dlisio.errors.ErrorHandler()) buffer = bytearray(1) From c4ba9c20cc7b40f448cd982fd143ed5fd7bb92d2 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Tue, 22 Sep 2020 16:59:40 +0200 Subject: [PATCH 17/25] Add error handler to findfdata method --- lib/extension/dlisio/ext/io.hpp | 3 ++- lib/src/io.cpp | 24 ++++++++++++++---- python/data/chap3/README.rst | 5 ++-- .../chap3/implicit/fdata-broken-obname.dlis | Bin 112 -> 160 bytes python/dlisio/load.py | 2 +- python/tests/test_error_handler.py | 6 +++++ python/tests/test_logical_record.py | 3 ++- 7 files changed, 33 insertions(+), 10 deletions(-) diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index 0825b3139..77257261b 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -75,7 +75,8 @@ dl::record& extract(stream&, long long, long long, dl::record&) noexcept (false) stream_offsets findoffsets(dl::stream&, dl::error_handler&) noexcept (false); std::map< dl::ident, std::vector< long long > > -findfdata(dl::stream&, const std::vector< long long >&) noexcept (false); +findfdata(dl::stream&, const std::vector< long long >&, dl::error_handler&) +noexcept (false); } diff --git a/lib/src/io.cpp b/lib/src/io.cpp index 650b33e46..c05eadc88 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -529,8 +529,8 @@ noexcept (false) { } std::map< dl::ident, std::vector< long long > > -findfdata(dl::stream& file, const std::vector< long long >& tells) -noexcept (false) { +findfdata(dl::stream& file, const std::vector< long long >& tells, +dl::error_handler& errorhandler) noexcept (false) { std::map< dl::ident, std::vector< long long > > xs; constexpr std::size_t OBNAME_SIZE_MAX = 262; @@ -538,8 +538,20 @@ noexcept (false) { record rec; rec.data.reserve( OBNAME_SIZE_MAX ); + const auto handle = [&]( const std::string& problem ) { + const auto context = "dl::findfdata: Indexing implicit records"; + errorhandler.log(dl::error_severity::CRITICAL, context, problem, "", + "Record is skipped"); + }; + for (auto tell : tells) { - extract(file, tell, OBNAME_SIZE_MAX, rec); + try { + extract(file, tell, OBNAME_SIZE_MAX, rec); + } catch (std::exception& e) { + handle(e.what()); + continue; + } + if (rec.isencrypted()) continue; if (rec.type != 0) continue; if (rec.data.size() == 0) continue; @@ -552,8 +564,10 @@ noexcept (false) { std::size_t obname_size = cur - rec.data.data(); if (obname_size > rec.data.size()) { - auto msg = "File corrupted. Error on reading fdata obname"; - throw std::runtime_error(msg); + const auto problem = + "fdata record corrupted, error on reading obname"; + handle(problem); + continue; } dl::obname tmp{ dl::origin{ origin }, dl::ushort{ copy }, diff --git a/python/data/chap3/README.rst b/python/data/chap3/README.rst index 619efeb8f..f8ebce4ee 100644 --- a/python/data/chap3/README.rst +++ b/python/data/chap3/README.rst @@ -25,8 +25,9 @@ start.dlis.part SUL + all the VR and LRSH bytesiBytes 50 and 51 ====================================== ======================================== Files in /implicit Description ====================================== ======================================== -fdata-broken-obname.dlis File is truncated in the obname of fdata - record +fdata-broken-obname.dlis Obname length in first implicit record is + bigger than remaining record length. + Second implicit record is correct fdata-encrypted.dlis Fdata record is encrypted diff --git a/python/data/chap3/implicit/fdata-broken-obname.dlis b/python/data/chap3/implicit/fdata-broken-obname.dlis index 9176347925062e7672406c6fefea0f945e9c350c..4e1638b801a53df227a8b39173d5383cce198d39 100644 GIT binary patch delta 85 zcmXR|z&Ih$Kj1$jgA4=12i_n@KNo*rU8f*_Z&%%5e_z)S4^KaL1_K5LW=1X-AJ1T2 ow;)GfR|Uts6orh`qEvA(Ul*E!m0Q<8T-2eap delta 37 scmZ3$STG?_Ug19@gA4=12i_n@KNo*rU8f*_Z&%%5e_z)S4^KaL0L$PChX4Qo diff --git a/python/dlisio/load.py b/python/dlisio/load.py index 5dfbee3b9..08912266d 100644 --- a/python/dlisio/load.py +++ b/python/dlisio/load.py @@ -132,7 +132,7 @@ def rewind(offset, tif): recs = core.extract(stream, explicits, error_handler) sets = core.parse_objects(recs) pool = core.pool(sets) - fdata = core.findfdata(stream, implicits) + fdata = core.findfdata(stream, implicits, error_handler) lf = logicalfile(stream, pool, fdata, sul, error_handler) lfs.append(lf) diff --git a/python/tests/test_error_handler.py b/python/tests/test_error_handler.py index ee8c672c1..a7ae14c50 100644 --- a/python/tests/test_error_handler.py +++ b/python/tests/test_error_handler.py @@ -71,6 +71,12 @@ def test_extract_broken_padbytes(assert_error): valid_obj = f.object("VALID-SET", "VALID-OBJ", 10, 0) assert valid_obj +def test_findfdata_bad_obname(assert_error): + path = 'data/chap3/implicit/fdata-broken-obname.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("fdata record corrupted, error on reading obname") + assert "T.FRAME-I.DLIS-FRAME-O.3-C.1" in f.fdata_index + # TODO: test_parse_exeptions def test_parse_critical_escaped(tmpdir, merge_files_oneLR, diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index fe6dca085..f8f324887 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -718,7 +718,8 @@ def test_findfdata_bad_obname(): # Obname is truncated by LRS-length, which in itself is fine as long as it # continues on the next LRS. However, in this case there are no new LRS. - assert 'File corrupted. Error on reading fdata obname' in str(excinfo.value) + msg = 'fdata record corrupted, error on reading obname' + assert msg in str(excinfo.value) def test_unexpected_attribute_in_set(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'unexpected-attribute.dlis') From 97c316eaa7b10398b31b282a98bc8df7f6b993e9 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Wed, 23 Sep 2020 13:20:53 +0200 Subject: [PATCH 18/25] Divide read_fdata function Divide function into two parts to make code for now less difficult to deal with. This is by no means enough and sufficient, and proper refactoring of this code is required. --- python/dlisio/ext/core.cpp | 578 ++++++++++++++++++++----------------- 1 file changed, 308 insertions(+), 270 deletions(-) diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 0bddb11b7..994ee24ae 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -325,6 +326,311 @@ dl::ident fingerprint(const std::string& type, return ref.fingerprint(); } +void assert_overflow(const char* ptr, const char* end, int skip) { + if (ptr + skip > end) { + const auto msg = "corrupted record: fmtstr would read past end"; + throw std::runtime_error(msg); + } +} + +void read_curve_sample(const char* f, const char*& ptr, const char* end, + unsigned char*& dst) +{ + /* + * Reads to dst buffer a singular curve where: + * f - current fmt character + * ptr - pointer to current position in source buffer (file record) + * end - pointer to end of the source buffer (record end) + * dst - pointer to current position in destination buffer + */ + + /* + * Supporting bounded-length identifiers in frame data is + * slightly more difficult than it immediately seem like, and + * this implementation relies on a few assumptions that may not + * hold. + * + * 1. numpy structured arrays interpret unicode on the fly + * + * On my amd64 linux: + * >>> dt = np.dtype('U5') + * >>> dt.itemsize + * 20 + * >>> np.array(['foo'], dtype = dt)[0] + * 'foo' + * >>> np.array(['foobar'], dtype = dt)[0] + * 'fooba' + * + * Meaning it supports string lengths of [0, n]. It apparently + * (and maybe rightly so) uses null termination, or the bounded + * length, which ever comes first. + * + * 2. numpy stores characters as int32 Py_UNICODE + * Numpy seems to always use uint32, and not Py_UNICODE, which + * can be both 16 and 32 bits [1]. Since it's an integer it's + * endian sensitive, and widening from char works. This is not + * really documented by numpy. + * + * 3. numpy stores no metadata with the string + * It is assumed, and seems necessary from the interface, that + * there is no in-band metadata stored about the strings when + * used in structured arrays. This means we can just write the + * unicode ourselves, and have numpy interpret it correctly. + * + * -- + * Units is just an IDENT in disguise, so it can very well take + * the same code path. + * + * [1] http://docs.h5py.org/en/stable/strings.html#what-about-numpy-s-u-type + * NumPy also has a Unicode type, a UTF-32 fixed-width + * format (4-byte characters). HDF5 has no support for wide + * characters. Rather than trying to hack around this and + * “pretend” to support it, h5py will raise an error when + * attempting to create datasets or attributes of this + * type. + * + */ + auto swap_pointer = [&](py::object obj) + { + PyObject* p; + std::memcpy(&p, dst, sizeof(p)); + Py_DECREF(p); + p = obj.inc_ref().ptr(); + std::memcpy(dst, &p, sizeof(p)); + dst += sizeof(p); + }; + + if (*f == DLIS_FMT_FSING1) { + float v; + float a; + ptr = dlis_fsing1(ptr, &v, &a); + auto t = py::make_tuple(v, a); + + swap_pointer(t); + return; + } + + if (*f == DLIS_FMT_FSING2) { + float v; + float a; + float b; + ptr = dlis_fsing2(ptr, &v, &a, &b); + auto t = py::make_tuple(v, a, b); + + swap_pointer(t); + return; + } + + if (*f == DLIS_FMT_FDOUB1) { + double v; + double a; + ptr = dlis_fdoub1(ptr, &v, &a); + auto t = py::make_tuple(v, a); + + swap_pointer(t); + return; + } + + if (*f == DLIS_FMT_FDOUB2) { + double v; + double a; + double b; + ptr = dlis_fdoub2(ptr, &v, &a, &b); + auto t = py::make_tuple(v, a, b); + + swap_pointer(t); + return; + } + + if (*f == DLIS_FMT_IDENT || *f == DLIS_FMT_UNITS) { + constexpr auto chars = 255; + constexpr auto ident_size = chars * sizeof(std::uint32_t); + + std::int32_t len; + char tmp[chars]; + ptr = dlis_ident(ptr, &len, tmp); + + /* + * From reading the numpy source, it looks like they put + * and interpret the unicode buffer in the array directly, + * and pad with zero. This means the string is both null + * and length terminated, whichever comes first. + */ + std::memset(dst, 0, ident_size); + for (auto i = 0; i < len; ++i) { + const auto x = std::uint32_t(tmp[i]); + std::memcpy(dst + i * sizeof(x), &x, sizeof(x)); + } + dst += ident_size; + return; + } + + if (*f == DLIS_FMT_ASCII) { + std::int32_t len; + ptr = dlis_uvari(ptr, &len); + auto ascii = py::str(ptr, len); + ptr += len; + + /* + * Numpy seems to default initalize object types even in + * the case of np.empty to None [1]. The refcount is surely + * increased, so decref it before replacing the pointer + * with a fresh str. + * + * [1] Array of uninitialized (arbitrary) data of the given + * shape, dtype, and order. Object arrays will be + * initialized to None. + * https://docs.scipy.org/doc/numpy/reference/generated/numpy.empty.html + */ + swap_pointer(ascii); + return; + } + + if (*f == DLIS_FMT_OBNAME) { + std::int32_t origin; + std::uint8_t copy; + std::int32_t idlen; + char id[255]; + ptr = dlis_obname(ptr, &origin, ©, &idlen, id); + + const auto name = dl::obname { + dl::origin(origin), + dl::ushort(copy), + dl::ident(std::string(id, idlen)), + }; + + swap_pointer(py::cast(name)); + return; + } + + if (*f == DLIS_FMT_OBJREF) { + std::int32_t idlen; + char id[255]; + std::int32_t origin; + std::uint8_t copy; + std::int32_t objnamelen; + char objname[255]; + ptr = dlis_objref(ptr, + &idlen, + id, + &origin, + ©, + &objnamelen, + objname); + + const auto name = dl::objref { + dl::ident(std::string(id, idlen)), + dl::obname { + dl::origin(origin), + dl::ushort(copy), + dl::ident(std::string(objname, objnamelen)), + }, + }; + + swap_pointer(py::cast(name)); + return; + } + + if (*f == DLIS_FMT_ATTREF) { + std::int32_t id1len; + char id1[255]; + std::int32_t origin; + std::uint8_t copy; + std::int32_t objnamelen; + char objname[255]; + std::int32_t id2len; + char id2[255]; + ptr = dlis_attref(ptr, + &id1len, + id1, + &origin, + ©, + &objnamelen, + objname, + &id2len, + id2); + + const auto ref = dl::attref { + dl::ident(std::string(id1, id1len)), + dl::obname { + dl::origin(origin), + dl::ushort(copy), + dl::ident(std::string(objname, objnamelen)), + }, + dl::ident(std::string(id2, id2len)), + }; + + swap_pointer(py::cast(ref)); + return; + } + + if (*f == DLIS_FMT_DTIME) { + int Y, TZ, M, D, H, MN, S, MS; + ptr = dlis_dtime(ptr, &Y, &TZ, &M, &D, &H, &MN, &S, &MS); + Y = dlis_year(Y); + const auto US = MS * 1000; + + PyObject* p; + std::memcpy(&p, dst, sizeof(p)); + Py_DECREF(p); + p = PyDateTime_FromDateAndTime(Y, M, D, H, MN, S, US); + if (!p) throw py::error_already_set(); + std::memcpy(dst, &p, sizeof(p)); + dst += sizeof(p); + return; + } + + int src_skip, dst_skip; + const char localfmt[] = {*f, '\0'}; + dlis_packflen(localfmt, ptr, &src_skip, &dst_skip); + assert_overflow(ptr, end, src_skip); + dlis_packf(localfmt, ptr, dst); + dst += dst_skip; + ptr += src_skip; +} + +void read_fdata_frame(const char* fmt, const char*& ptr, const char* end, + unsigned char*& dst) noexcept (false) { + for (auto* f = fmt; *f; ++f) { + read_curve_sample(f, ptr, end, dst); + } +} + +void read_fdata_record(const char* pre_fmt, + const char* fmt, + const char* post_fmt, + const char* ptr, + const char* end, + unsigned char*& dst, + int& frames, + const std::size_t& itemsize, + std::size_t allocated_rows, + std::function resize) +noexcept (false) { + + /* get frame number and slots */ + while (ptr < end) { + if (frames == allocated_rows) { + resize(frames * 2); + dst += (frames * itemsize); + } + + int src_skip, dst_skip; + + dlis_packflen(pre_fmt, ptr, &src_skip, nullptr); + assert_overflow(ptr, end, src_skip); + ptr += src_skip; + + read_fdata_frame(fmt, ptr, end, dst); + + dlis_packflen(post_fmt, ptr, &src_skip, nullptr); + assert_overflow(ptr, end, src_skip); + ptr += src_skip; + + ++frames; + } +} + py::object read_fdata(const char* pre_fmt, const char* fmt, const char* post_fmt, @@ -409,276 +715,8 @@ noexcept (false) { std::uint8_t copy; ptr = dlis_obname(ptr, &origin, ©, nullptr, nullptr); - /* get frame number and slots */ - while (ptr < end) { - if (frames == allocated_rows) { - resize(frames * 2); - dst += (frames * itemsize); - } - - auto assert_overflow = [end](const char* ptr, int skip) { - if (ptr + skip > end) { - const auto msg = "corrupted record: fmtstr would read past end"; - throw std::runtime_error(msg); - } - }; - - int src_skip, dst_skip; - dlis_packflen(pre_fmt, ptr, &src_skip, nullptr); - assert_overflow(ptr, src_skip); - ptr += src_skip; - - for (auto* f = fmt; *f; ++f) { - /* - * Supporting bounded-length identifiers in frame data is - * slightly more difficult than it immediately seem like, and - * this implementation relies on a few assumptions that may not - * hold. - * - * 1. numpy structured arrays interpret unicode on the fly - * - * On my amd64 linux: - * >>> dt = np.dtype('U5') - * >>> dt.itemsize - * 20 - * >>> np.array(['foo'], dtype = dt)[0] - * 'foo' - * >>> np.array(['foobar'], dtype = dt)[0] - * 'fooba' - * - * Meaning it supports string lengths of [0, n]. It apparently - * (and maybe rightly so) uses null termination, or the bounded - * length, which ever comes first. - * - * 2. numpy stores characters as int32 Py_UNICODE - * Numpy seems to always use uint32, and not Py_UNICODE, which - * can be both 16 and 32 bits [1]. Since it's an integer it's - * endian sensitive, and widening from char works. This is not - * really documented by numpy. - * - * 3. numpy stores no metadata with the string - * It is assumed, and seems necessary from the interface, that - * there is no in-band metadata stored about the strings when - * used in structured arrays. This means we can just write the - * unicode ourselves, and have numpy interpret it correctly. - * - * -- - * Units is just an IDENT in disguise, so it can very well take - * the same code path. - * - * [1] http://docs.h5py.org/en/stable/strings.html#what-about-numpy-s-u-type - * NumPy also has a Unicode type, a UTF-32 fixed-width - * format (4-byte characters). HDF5 has no support for wide - * characters. Rather than trying to hack around this and - * “pretend” to support it, h5py will raise an error when - * attempting to create datasets or attributes of this - * type. - * - */ - auto swap_pointer = [&](py::object obj) - { - PyObject* p; - std::memcpy(&p, dst, sizeof(p)); - Py_DECREF(p); - p = obj.inc_ref().ptr(); - std::memcpy(dst, &p, sizeof(p)); - dst += sizeof(p); - }; - - if (*f == DLIS_FMT_FSING1) { - float v; - float a; - ptr = dlis_fsing1(ptr, &v, &a); - auto t = py::make_tuple(v, a); - - swap_pointer(t); - continue; - } - - if (*f == DLIS_FMT_FSING2) { - float v; - float a; - float b; - ptr = dlis_fsing2(ptr, &v, &a, &b); - auto t = py::make_tuple(v, a, b); - - swap_pointer(t); - continue; - } - - if (*f == DLIS_FMT_FDOUB1) { - double v; - double a; - ptr = dlis_fdoub1(ptr, &v, &a); - auto t = py::make_tuple(v, a); - - swap_pointer(t); - continue; - } - - if (*f == DLIS_FMT_FDOUB2) { - double v; - double a; - double b; - ptr = dlis_fdoub2(ptr, &v, &a, &b); - auto t = py::make_tuple(v, a, b); - - swap_pointer(t); - continue; - } - - if (*f == DLIS_FMT_IDENT || *f == DLIS_FMT_UNITS) { - constexpr auto chars = 255; - constexpr auto ident_size = chars * sizeof(std::uint32_t); - - std::int32_t len; - char tmp[chars]; - ptr = dlis_ident(ptr, &len, tmp); - - /* - * From reading the numpy source, it looks like they put - * and interpret the unicode buffer in the array directly, - * and pad with zero. This means the string is both null - * and length terminated, whichever comes first. - */ - std::memset(dst, 0, ident_size); - for (auto i = 0; i < len; ++i) { - const auto x = std::uint32_t(tmp[i]); - std::memcpy(dst + i * sizeof(x), &x, sizeof(x)); - } - dst += ident_size; - continue; - } - - if (*f == DLIS_FMT_ASCII) { - std::int32_t len; - ptr = dlis_uvari(ptr, &len); - auto ascii = py::str(ptr, len); - ptr += len; - - /* - * Numpy seems to default initalize object types even in - * the case of np.empty to None [1]. The refcount is surely - * increased, so decref it before replacing the pointer - * with a fresh str. - * - * [1] Array of uninitialized (arbitrary) data of the given - * shape, dtype, and order. Object arrays will be - * initialized to None. - * https://docs.scipy.org/doc/numpy/reference/generated/numpy.empty.html - */ - swap_pointer(ascii); - continue; - } - - if (*f == DLIS_FMT_OBNAME) { - std::int32_t origin; - std::uint8_t copy; - std::int32_t idlen; - char id[255]; - ptr = dlis_obname(ptr, &origin, ©, &idlen, id); - - const auto name = dl::obname { - dl::origin(origin), - dl::ushort(copy), - dl::ident(std::string(id, idlen)), - }; - - swap_pointer(py::cast(name)); - continue; - } - - if (*f == DLIS_FMT_OBJREF) { - std::int32_t idlen; - char id[255]; - std::int32_t origin; - std::uint8_t copy; - std::int32_t objnamelen; - char objname[255]; - ptr = dlis_objref(ptr, - &idlen, - id, - &origin, - ©, - &objnamelen, - objname); - - const auto name = dl::objref { - dl::ident(std::string(id, idlen)), - dl::obname { - dl::origin(origin), - dl::ushort(copy), - dl::ident(std::string(objname, objnamelen)), - }, - }; - - swap_pointer(py::cast(name)); - continue; - } - - if (*f == DLIS_FMT_ATTREF) { - std::int32_t id1len; - char id1[255]; - std::int32_t origin; - std::uint8_t copy; - std::int32_t objnamelen; - char objname[255]; - std::int32_t id2len; - char id2[255]; - ptr = dlis_attref(ptr, - &id1len, - id1, - &origin, - ©, - &objnamelen, - objname, - &id2len, - id2); - - const auto ref = dl::attref { - dl::ident(std::string(id1, id1len)), - dl::obname { - dl::origin(origin), - dl::ushort(copy), - dl::ident(std::string(objname, objnamelen)), - }, - dl::ident(std::string(id2, id2len)), - }; - - swap_pointer(py::cast(ref)); - continue; - } - - if (*f == DLIS_FMT_DTIME) { - int Y, TZ, M, D, H, MN, S, MS; - ptr = dlis_dtime(ptr, &Y, &TZ, &M, &D, &H, &MN, &S, &MS); - Y = dlis_year(Y); - const auto US = MS * 1000; - - PyObject* p; - std::memcpy(&p, dst, sizeof(p)); - Py_DECREF(p); - p = PyDateTime_FromDateAndTime(Y, M, D, H, MN, S, US); - if (!p) throw py::error_already_set(); - std::memcpy(dst, &p, sizeof(p)); - dst += sizeof(p); - continue; - } - - const char localfmt[] = {*f, '\0'}; - dlis_packflen(localfmt, ptr, &src_skip, &dst_skip); - assert_overflow(ptr, src_skip); - dlis_packf(localfmt, ptr, dst); - dst += dst_skip; - ptr += src_skip; - } - - dlis_packflen(post_fmt, ptr, &src_skip, nullptr); - assert_overflow(ptr, src_skip); - ptr += src_skip; - - ++frames; - } + read_fdata_record(pre_fmt, fmt, post_fmt, ptr, end, dst, frames, + itemsize, allocated_rows, resize); } assert(allocated_rows >= frames); From dd927f6f3e676579f2a84407391718ac0173dd48 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Thu, 15 Oct 2020 12:45:48 +0200 Subject: [PATCH 19/25] Add error handler to readfdata Note: same problems with implementation as for findoffsets and others. Agreement regarding try-catch size, use of throw and methods extraction wasn't unanimous. --- python/data/chap4-7/README.rst | 4 +++ python/data/chap4-7/iflr/broken-fmt.dlis | Bin 0 -> 414 bytes python/dlisio/dlisutils.py | 1 + python/dlisio/ext/core.cpp | 37 ++++++++++++++++++++--- python/tests/test_curves_reprcodes.py | 6 ++++ python/tests/test_error_handler.py | 9 ++++++ 6 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 python/data/chap4-7/iflr/broken-fmt.dlis diff --git a/python/data/chap4-7/README.rst b/python/data/chap4-7/README.rst index b4575316a..ed6dc81aa 100644 --- a/python/data/chap4-7/README.rst +++ b/python/data/chap4-7/README.rst @@ -93,6 +93,10 @@ broken-ascii.dlis Ascii with incorrect specified characters are expected, but only 20 are actually present +broken-fmt.dlis There is less data in first fdata record than it + should be according to fmt string. Second IFLR is + correct. + broken-utf8-ascii.dlis Ascii containing non-utf8 characters duplicate-framenos.dlis 2 FDATA containing different values, but they have diff --git a/python/data/chap4-7/iflr/broken-fmt.dlis b/python/data/chap4-7/iflr/broken-fmt.dlis new file mode 100644 index 0000000000000000000000000000000000000000..26312ba78a3c4c67a5ea002122f7fb4f0aa784d5 GIT binary patch literal 414 zcma)$K~KU!5QVpZQ8AIIBqqkg^ys06B{jgYbcfK;ZgmS@&4zAClg5&E@#KM@+<%}z z%1I8+(fhXWSaQ zA6o9TN6mVp1M`Lqx2@`IBAfv2iy9T(?~>OC&)P%xq^Cw*J_sFV!Z literal 0 HcmV?d00001 diff --git a/python/dlisio/dlisutils.py b/python/dlisio/dlisutils.py index 97cbbeaa2..dd668fe02 100644 --- a/python/dlisio/dlisutils.py +++ b/python/dlisio/dlisutils.py @@ -24,4 +24,5 @@ def curves(dlis, frame, dtype, pre_fmt, fmt, post_fmt): indices, dtype.itemsize, alloc, + dlis.error_handler ) diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 994ee24ae..9b82525b8 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -637,7 +637,8 @@ py::object read_fdata(const char* pre_fmt, dl::stream& file, const std::vector< long long >& indices, std::size_t itemsize, - py::object alloc) + py::object alloc, + dl::error_handler& errorhandler) noexcept (false) { // TODO: reverse fingerprint to skip bytes ahead-of-time /* @@ -688,6 +689,7 @@ noexcept (false) { dst = static_cast< unsigned char* >(info.ptr); }; + /* * The frameno is a part of the dtype, and only frame.channels is allowed * to call this function. If pre/post format is set, wrong data is read. @@ -698,13 +700,33 @@ noexcept (false) { assert(std::string(pre_fmt) == ""); assert(std::string(post_fmt) == ""); + auto record_dst = dst; + + const auto handle = [&]( const std::string& problem ) { + const auto context = "dl::read_fdata: reading curves"; + errorhandler.log(dl::error_severity::CRITICAL, context, problem, "", + "Record is skipped"); + // we update the buffer as we go. Hence if error happened we need to + // go back and start rewriting updated data + dst = record_dst; + }; + int frames = 0; for (auto i : indices) { + record_dst = dst; + /* get record */ - auto record = dl::extract(file, i); + dl::record record; + try { + record = dl::extract(file, i); + } catch (std::exception& e) { + handle(e.what()); + continue; + } if (record.isencrypted()) { - throw dl::not_implemented("encrypted FDATA record"); + handle("encrypted FDATA record"); + continue; } const auto* ptr = record.data.data(); @@ -715,8 +737,13 @@ noexcept (false) { std::uint8_t copy; ptr = dlis_obname(ptr, &origin, ©, nullptr, nullptr); - read_fdata_record(pre_fmt, fmt, post_fmt, ptr, end, dst, frames, - itemsize, allocated_rows, resize); + try { + read_fdata_record(pre_fmt, fmt, post_fmt, ptr, end, dst, frames, + itemsize, allocated_rows, resize); + } catch (std::exception& e) { + handle(e.what()); + continue; + } } assert(allocated_rows >= frames); diff --git a/python/tests/test_curves_reprcodes.py b/python/tests/test_curves_reprcodes.py index 4d4209151..534b74b2a 100644 --- a/python/tests/test_curves_reprcodes.py +++ b/python/tests/test_curves_reprcodes.py @@ -385,3 +385,9 @@ def test_ascii_broken(): def test_ascii_broken_utf8(): fpath = 'data/chap4-7/iflr/broken-utf8-ascii.dlis' _ = load_curves(fpath) + +def test_fmt_broken(): + fpath = 'data/chap4-7/iflr/broken-fmt.dlis' + with pytest.raises(RuntimeError) as exc: + _ = load_curves(fpath) + assert "fmtstr would read past end" in str(exc.value) diff --git a/python/tests/test_error_handler.py b/python/tests/test_error_handler.py index a7ae14c50..cf340b18e 100644 --- a/python/tests/test_error_handler.py +++ b/python/tests/test_error_handler.py @@ -5,6 +5,7 @@ import pytest import os +import numpy as np import dlisio @@ -77,6 +78,14 @@ def test_findfdata_bad_obname(assert_error): assert_error("fdata record corrupted, error on reading obname") assert "T.FRAME-I.DLIS-FRAME-O.3-C.1" in f.fdata_index +def test_curves_broken_fmt(assert_error): + path = 'data/chap4-7/iflr/broken-fmt.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + frame = f.object('FRAME', 'FRAME-REPRCODE', 10, 0) + curves = frame.curves() + assert_error("fmtstr would read past end") + assert np.array_equal(curves['FRAMENO'], np.array([1, 3])) + # TODO: test_parse_exeptions def test_parse_critical_escaped(tmpdir, merge_files_oneLR, From 2de34f15825722d9347fe5eaa812cc7bab4dfb49 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Wed, 18 Nov 2020 14:18:00 +0100 Subject: [PATCH 20/25] Add error handler to parse We already escape recoverable errors by storing them inside objects and attributes. Now we also can escape unparsable errors by returning everything we were able to process before. --- lib/extension/dlisio/ext/types.hpp | 2 +- lib/src/parse.cpp | 26 +++++++++++++++++++++----- python/tests/test_error_handler.py | 27 +++++++++++++++++++++++++++ python/tests/test_logical_record.py | 10 +++++----- 4 files changed, 54 insertions(+), 11 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 5c5617c9f..38476d724 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -580,7 +580,7 @@ struct object_set { dl::object_vector objs; dl::object_template tmpl; - void parse() noexcept (false); + void parse() noexcept (true); bool parsed = false; const char* parse_set_component(const char* cur) noexcept (false); diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 5f6cec809..7c391f9f0 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -1139,15 +1139,31 @@ object_set::object_set(dl::record rec) noexcept (false) { parse_set_component(this->record.data.data()); } -void object_set::parse() noexcept (false) { +void object_set::parse() noexcept (true) { if (this->parsed) return; const char* cur = this->record.data.data(); - /* As cursor value is not stored, read data again to get the position */ - cur = parse_set_component(cur); - cur = parse_template(cur); - parse_objects(cur); + try { + /* As cursor value is not stored, read data again to get the position */ + cur = parse_set_component(cur); + cur = parse_template(cur); + parse_objects(cur); + } catch (const std::exception& e) { + dlis_error err { + dl::error_severity::CRITICAL, + e.what(), + "", + "object set parse has been interrupted" + }; + this->log.push_back(err); + } + // TODO: possible assert that cur == end of data + + /* If set is parsed in default mode, exception will be thrown and set won't + * be considered parsed. If set is parsed after that in error-escape mode, + * parsing will be locked, but error will get stored on object set anyway. + */ this->parsed = true; } diff --git a/python/tests/test_error_handler.py b/python/tests/test_error_handler.py index cf340b18e..326303aa3 100644 --- a/python/tests/test_error_handler.py +++ b/python/tests/test_error_handler.py @@ -105,6 +105,33 @@ def test_parse_critical_escaped(tmpdir, merge_files_oneLR, _ = obj['INVALID'] assert_error("invalid representation code") + +def test_parse_unparsable_record(tmpdir, merge_files_oneLR, assert_error): + path = os.path.join(str(tmpdir), 'unparsable.dlis') + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/empty.dlis.part', + 'data/chap3/object/object2.dlis.part', + # here must go anything that will be considered unrecoverable + 'data/chap3/objattr/reprcode-invalid-value.dlis.part' + ] + merge_files_oneLR(path, content) + + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + assert_error("Action taken: object set parse has been interrupted") + + # value is unclear and shouldn't be trusted + _ = obj['INVALID'] + assert_error("invalid representation code") + + with pytest.raises(ValueError) as excinfo: + _ = f.object('VERY_MUCH_TESTY_SET', 'OBJECT2', 1, 1) + assert "not found" in str(excinfo.value) + + def test_parse_major_errored(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'replacement-set.dlis') content = [ diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index f8f324887..ad4c4fb9e 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -617,7 +617,7 @@ def test_novalue_more_count(tmpdir, merge_files_oneLR): merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(NotImplementedError): + with pytest.raises(RuntimeError): f.load() def test_set_redundat(tmpdir, merge_files_oneLR, assert_info): @@ -748,7 +748,7 @@ def test_unexpected_set_in_object(tmpdir, merge_files_oneLR): merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(ValueError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: f.load() assert "expected ATTRIB" in str(excinfo.value) @@ -765,7 +765,7 @@ def test_unexpected_set_in_template(tmpdir, merge_files_oneLR): merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(ValueError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: f.load() assert "expected ATTRIB" in str(excinfo.value) @@ -782,7 +782,7 @@ def test_unexpected_attribute_instead_of_object(tmpdir, merge_files_oneLR): merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(ValueError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: f.load() assert "expected OBJECT" in str(excinfo.value) @@ -797,7 +797,7 @@ def test_cut_before_template(tmpdir, merge_files_oneLR): merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(IndexError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: f.load() assert "unexpected end-of-record" in str(excinfo.value) From 99d0127c04b2ba56400e7e99e293755e874866b6 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Fri, 6 Nov 2020 13:43:47 +0100 Subject: [PATCH 21/25] Add error handler to parse_objects --- python/data/chap3/README.rst | 11 +++++++++++ python/data/chap3/explicit/broken-in-set.dlis | Bin 0 -> 228 bytes python/dlisio/ext/core.cpp | 14 ++++++++++++-- python/dlisio/load.py | 2 +- python/tests/test_error_handler.py | 8 ++++++-- python/tests/test_logical_record.py | 2 +- 6 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 python/data/chap3/explicit/broken-in-set.dlis diff --git a/python/data/chap3/README.rst b/python/data/chap3/README.rst index f8ebce4ee..36d203944 100644 --- a/python/data/chap3/README.rst +++ b/python/data/chap3/README.rst @@ -58,6 +58,17 @@ fdata-vr-obname-trailing.dlis Fdata obname is interrupted by Visible ====================================== ======================================== +*explicit* folder contains full files. + +====================================== ======================================== +Files in /explicit Description +====================================== ======================================== +broken-in-set.dlis First EFLR is correct, second EFLR is + broken in set component (not a set), + third EFLR is correct +====================================== ======================================== + + Creating a valid .dlis-file --------------------------- diff --git a/python/data/chap3/explicit/broken-in-set.dlis b/python/data/chap3/explicit/broken-in-set.dlis new file mode 100644 index 0000000000000000000000000000000000000000..a1502f83a45ad2cf5034eef0205a72e40d250535 GIT binary patch literal 228 zcmY#TP%sQL)H5&$a&`6(a#64_v@~)_O-n4zDNzV6$uCMwPgMv`Em0sAFiiQ+$Y9Wr z@qshU(Z|z8H`q1AfHlA|$kDfeivcR;@8qS(4HnY{DhhQ4sb}DHa&&YAt6xP3K+Jdd J_xC5xd;n(CM2!Fd literal 0 HcmV?d00001 diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 9b82525b8..4db0bd130 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -1084,11 +1084,21 @@ PYBIND11_MODULE(core, m) { return recs; }); - m.def( "parse_objects", []( const std::vector< dl::record >& recs ) { + m.def( "parse_objects", []( const std::vector< dl::record >& recs, + dl::error_handler& errorhandler ) { std::vector< dl::object_set > objects; for (const auto& rec : recs) { if (rec.isencrypted()) continue; - objects.push_back( dl::object_set( rec ) ); + + try { + objects.push_back( dl::object_set( rec ) ); + } catch (const std::exception& e) { + const auto context = + "core.parse_objects: Construct object sets"; + errorhandler.log(dl::error_severity::CRITICAL, context, + e.what(), "", "Set is skipped"); + continue; + } } return objects; }); diff --git a/python/dlisio/load.py b/python/dlisio/load.py index 08912266d..1c3d49850 100644 --- a/python/dlisio/load.py +++ b/python/dlisio/load.py @@ -130,7 +130,7 @@ def rewind(offset, tif): hint = rewind(stream.absolute_tell, tapemarks) recs = core.extract(stream, explicits, error_handler) - sets = core.parse_objects(recs) + sets = core.parse_objects(recs, error_handler) pool = core.pool(sets) fdata = core.findfdata(stream, implicits, error_handler) diff --git a/python/tests/test_error_handler.py b/python/tests/test_error_handler.py index 326303aa3..4e564034c 100644 --- a/python/tests/test_error_handler.py +++ b/python/tests/test_error_handler.py @@ -86,7 +86,12 @@ def test_curves_broken_fmt(assert_error): assert_error("fmtstr would read past end") assert np.array_equal(curves['FRAMENO'], np.array([1, 3])) -# TODO: test_parse_exeptions +def test_parse_objects_unexpected_attribute_in_set(assert_error): + path = 'data/chap3/explicit/broken-in-set.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("Construct object sets") + _ = f.object('VALID-SET', 'VALID-OBJ', 10, 0) + _ = f.object('GOOOD-SET', 'VALID-OBJ', 10, 0) def test_parse_critical_escaped(tmpdir, merge_files_oneLR, assert_error): @@ -131,7 +136,6 @@ def test_parse_unparsable_record(tmpdir, merge_files_oneLR, assert_error): _ = f.object('VERY_MUCH_TESTY_SET', 'OBJECT2', 1, 1) assert "not found" in str(excinfo.value) - def test_parse_major_errored(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'replacement-set.dlis') content = [ diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index ad4c4fb9e..eb5a38448 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -731,7 +731,7 @@ def test_unexpected_attribute_in_set(tmpdir, merge_files_oneLR): ] merge_files_oneLR(path, content) - with pytest.raises(ValueError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: dlisio.load(path) assert "expected SET" in str(excinfo.value) From db83c9527a756eea0872660e1669e4d228e0bdd2 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Fri, 6 Nov 2020 15:06:17 +0100 Subject: [PATCH 22/25] Fix object not found error message Invalid value was printed when origin or copynumber where 0. --- python/dlisio/file.py | 4 ++-- python/tests/test_logical_file.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/dlisio/file.py b/python/dlisio/file.py index 89710594c..f0366ac9b 100644 --- a/python/dlisio/file.py +++ b/python/dlisio/file.py @@ -452,8 +452,8 @@ def object(self, type, name, origin=None, copynr=None): if len(matches) == 0: msg = "Object not found: type={}, name={}, origin={}, copynumber={}" - if not origin: origin = "'any'" - if not copynr: copynr = "'any'" + if origin is None: origin = "'any'" + if copynr is None: copynr = "'any'" raise ValueError(msg.format(type, name, origin, copynr)) diff --git a/python/tests/test_logical_file.py b/python/tests/test_logical_file.py index 842e96662..b0751ca6c 100644 --- a/python/tests/test_logical_file.py +++ b/python/tests/test_logical_file.py @@ -28,7 +28,8 @@ def test_object_unknown(f): def test_object_nonexisting(f): with pytest.raises(ValueError) as exc: _ = f.object("UNKNOWN_TYPE", "SOME_OBJECT", 0, 0) - assert "Object not found" in str(exc.value) + assert ("Object not found: type=UNKNOWN_TYPE, name=SOME_OBJECT, " + "origin=0, copynumber=0") in str(exc.value) with pytest.raises(ValueError): _ = f.object("CHANNEL", "CHANN1", 11, 0) @@ -61,6 +62,8 @@ def test_object_many_objects_nameonly(tmpdir_factory, merge_files_manyLR): with pytest.raises(ValueError) as exc: _ = f.object("CHANNEL", "MATCH1") assert "Candidates are" in str(exc.value) + assert "origin=16, copy=0" in str(exc.value) + assert "origin=127, copy=0" in str(exc.value) def test_object_ducplicated_object(tmpdir_factory, merge_files_manyLR): # Spec violation: Two objects with the same signature AND identical content From b816b30b1b63ee322960912b7f61486e90bf4ae2 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Wed, 23 Sep 2020 16:58:22 +0200 Subject: [PATCH 23/25] Handle "count is larger" issue in parse Issue wasn't implemented, but now we can just log an error on the attribute and continue processing given that we believe that value is not present. If that doesn't hold and exception is thrown, it will be later handled, reported and processed objects returned. --- lib/src/parse.cpp | 22 +++++++++++++++------- python/tests/test_logical_record.py | 15 +++++++++++---- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 7c391f9f0..c61bfa156 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -828,14 +828,22 @@ noexcept (false) } /* - * count is larger, so insert default values, maybe? for now, throw - * exception and consider what to do when a file actually uses this - * behaviour + * count is larger, which makes little sense. Likely file is already + * spoiled, but mark attribute as errored and attempt to continue */ - const auto msg = "object attribute without no explicit value, but " - "count (which is {}) > size (which is {})" - ; - throw dl::not_implemented(fmt::format(msg, count, size)); + const auto msg = + "template value is not overridden by object attribute, but " + "count is. count ({}) > template count ({})"; + + dlis_error err { + dl::error_severity::CRITICAL, + fmt::format(msg, count, size), + "3.2.2.1 Component Descriptor: The number of Elements that " + "make up the Value is specified by the Count " + "Characteristic.", + "value is left as in template" + }; + attr.log.push_back(err); } /* diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index eb5a38448..a58a5e248 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -605,20 +605,27 @@ def test_novalue_less_count(tmpdir, merge_files_oneLR, assert_log): assert_log("< template count") -@pytest.mark.not_implemented def test_novalue_more_count(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'novalue-more-count.dlis') content = [ 'data/chap3/start.dlis.part', 'data/chap3/template/default.dlis.part', 'data/chap3/object/object.dlis.part', - 'data/chap3/objattr/count9-novalue.dlis.part' + 'data/chap3/objattr/count9-novalue.dlis.part', + 'data/chap3/object/object2.dlis.part', + 'data/chap3/objattr/all-set.dlis.part' ] merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(RuntimeError): - f.load() + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + with pytest.raises(RuntimeError) as excinfo: + _ = obj['DEFAULT_ATTRIBUTE'] + assert "> template count" in str(excinfo.value) + + obj2 = f.object('VERY_MUCH_TESTY_SET', 'OBJECT2', 1, 1) + attr = obj2['DEFAULT_ATTRIBUTE'] + assert attr == [1, 2, 3, 4] def test_set_redundat(tmpdir, merge_files_oneLR, assert_info): path = os.path.join(str(tmpdir), 'redundant-set.dlis') From 9ca59022a6f03851d9d78efeb52a0d511f7a2fa8 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Thu, 15 Oct 2020 16:45:01 +0200 Subject: [PATCH 24/25] Inform user when padbytes number equals LRS length We now can inform user about a situation where padbytes number was the same as LRS length. Previously that information was skipped, we barely assumed dlisio deals with this correctly. Similar way we can deal with other non-implemented warnings in io.cpp. --- lib/extension/dlisio/ext/io.hpp | 5 ++-- lib/src/io.cpp | 36 +++++++++++++++++----------- python/dlisio/ext/core.cpp | 4 ++-- python/tests/test_physical_layout.py | 3 ++- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index 77257261b..d65c0b4b7 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -69,8 +69,9 @@ long long findsul(stream&) noexcept (false); long long findvrl(stream&, long long) noexcept (false); bool hastapemark(stream&) noexcept (false); -dl::record extract(stream&, long long) noexcept (false); -dl::record& extract(stream&, long long, long long, dl::record&) noexcept (false); +dl::record extract(stream&, long long, dl::error_handler&) noexcept (false); +dl::record& extract(stream&, long long, long long, dl::record&, + dl::error_handler&) noexcept (false); stream_offsets findoffsets(dl::stream&, dl::error_handler&) noexcept (false); diff --git a/lib/src/io.cpp b/lib/src/io.cpp index c05eadc88..656fe5c87 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -177,7 +177,8 @@ bool type_consistent( const T& ) noexcept (true) { void trim_segment(std::uint8_t attrs, const char* begin, int segment_size, - std::vector< char >& segment) + std::vector< char >& segment, + dl::error_handler& errorhandler) noexcept (false) { int trim = 0; const auto* end = begin + segment_size; @@ -188,22 +189,27 @@ noexcept (false) { segment.resize(segment.size() - trim); return; - case DLIS_BAD_SIZE: + case DLIS_BAD_SIZE: { if (trim - segment_size != DLIS_LRSH_SIZE) { const auto msg = - "bad segment trim: padbytes (which is {}) " + "bad segment trim: trim size (which is {}) " ">= segment.size() (which is {})"; throw std::runtime_error(fmt::format(msg, trim, segment_size)); } - /* - * padbytes included the segment header. It's larger than - * the segment body, but only because it also counts the - * header. accept that, pretend the body was never added, - * and move on. - */ + errorhandler.log( + dl::error_severity::MINOR, + "extract (trim_segment)", + "trim size (padbytes + checksum + trailing length) = logical " + "record segment length", + "[from 2.2.2.1 Logical Record Segment Header (LRSH) and " + "2.2.2.4 Logical Record Segment Trailer (LRST) situation " + "should be impossible]", + "Segment is skipped"); + segment.resize(segment.size() - segment_size); return; + } default: throw std::invalid_argument("dlis_trim_record_segment"); @@ -299,14 +305,16 @@ template < typename T > using shortvec = std::basic_string< T >; -record extract(stream& file, long long tell) noexcept (false) { +record extract(stream& file, long long tell, error_handler& errorhandler) +noexcept (false) { record rec; rec.data.reserve( 8192 ); auto nbytes = std::numeric_limits< std::int64_t >::max(); - return extract(file, tell, nbytes, rec); + return extract(file, tell, nbytes, rec, errorhandler); } -record& extract(stream& file, long long tell, long long bytes, record& rec) noexcept (false) { +record& extract(stream& file, long long tell, long long bytes, record& rec, + error_handler& errorhandler) noexcept (false) { shortvec< std::uint8_t > attributes; shortvec< int > types; bool consistent = true; @@ -359,7 +367,7 @@ record& extract(stream& file, long long tell, long long bytes, record& rec) noex */ const auto* fst = rec.data.data() + prevsize; - trim_segment(attrs, fst, len, rec.data); + trim_segment(attrs, fst, len, rec.data, errorhandler); /* if the whole segment is getting trimmed, it's unclear if successor * attribute should be erased or not. For now ignoring. Suspecting @@ -546,7 +554,7 @@ dl::error_handler& errorhandler) noexcept (false) { for (auto tell : tells) { try { - extract(file, tell, OBNAME_SIZE_MAX, rec); + extract(file, tell, OBNAME_SIZE_MAX, rec, errorhandler); } catch (std::exception& e) { handle(e.what()); continue; diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 4db0bd130..138db652d 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -718,7 +718,7 @@ noexcept (false) { /* get record */ dl::record record; try { - record = dl::extract(file, i); + record = dl::extract(file, i, errorhandler); } catch (std::exception& e) { handle(e.what()); continue; @@ -1069,7 +1069,7 @@ PYBIND11_MODULE(core, m) { for (auto tell : tells) { dl::record rec; try { - rec = dl::extract(s, tell); + rec = dl::extract(s, tell, errorhandler); } catch (const std::exception& e) { const auto context = "dl::extract: Reading raw bytes from record"; diff --git a/python/tests/test_physical_layout.py b/python/tests/test_physical_layout.py index e3fce76ef..4d2065754 100644 --- a/python/tests/test_physical_layout.py +++ b/python/tests/test_physical_layout.py @@ -48,11 +48,12 @@ def test_logical_file_lrs_inconsistency(): msg = "logical file, but last logical record segment expects successor" assert msg in str(excinfo.value) -def test_padbytes_as_large_as_record(): +def test_padbytes_as_large_as_record(assert_info): path = 'data/chap2/padbytes-large-as-record.dlis' with dlisio.load(path) as (f,): assert len(f.find('.*', '.*')) == 0 assert len(f.fdata_index) == 0 + assert_info("= logical record segment length") def test_padbytes_as_large_as_segment_explicit(): path = 'data/chap2/padbytes-large-as-seg-explict.dlis' From 4630e5377615ed57fdf475fd3fc1d1973c814a20 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Wed, 18 Nov 2020 16:32:08 +0100 Subject: [PATCH 25/25] Add combined error handler tests Now as we added error handling to all the main methods, we can test how changes to error handler affect results of various operations combined with each other. --- python/data/chap4-7/README.rst | 4 +- .../many-logical-files-error-in-last.dlis | Bin 2048 -> 2048 bytes python/tests/test_error_handler.py | 102 +++++++++++++++++- 3 files changed, 102 insertions(+), 4 deletions(-) diff --git a/python/data/chap4-7/README.rst b/python/data/chap4-7/README.rst index ed6dc81aa..a18a85891 100644 --- a/python/data/chap4-7/README.rst +++ b/python/data/chap4-7/README.rst @@ -164,7 +164,9 @@ invalid-date-in-origin.dlis Simple file which contains inva many-logical-files.dlis Contains several logical files, one without file header -many-logical-files-error-in-last.dlis Contains several logical files, last one is broken +many-logical-files-error-in-last.dlis Contains several logical files, last one is broken. + Two first ones have minor issue (Origin sets are + redundant) many-logical-files-same-object.dlis Contains 2 logical files with the same objects and encrypted records diff --git a/python/data/chap4-7/many-logical-files-error-in-last.dlis b/python/data/chap4-7/many-logical-files-error-in-last.dlis index 90fae836ffc0f44a85c4a018e6d20054136b0095..5512a388f6757574830d470f4e0b0bd1ed039e27 100644 GIT binary patch delta 25 dcmZn=Xb_lig>l2?-K+~4H