diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index c919169f6..d65c0b4b7 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -58,6 +58,7 @@ class stream { struct stream_offsets { std::vector< long long > explicits; std::vector< long long > implicits; + std::vector< long long > broken; }; stream open(const std::string&, std::int64_t) noexcept (false); @@ -68,13 +69,15 @@ long long findsul(stream&) noexcept (false); long long findvrl(stream&, long long) noexcept (false); bool hastapemark(stream&) noexcept (false); -dl::record extract(stream&, long long) noexcept (false); -dl::record& extract(stream&, long long, long long, dl::record&) noexcept (false); +dl::record extract(stream&, long long, dl::error_handler&) noexcept (false); +dl::record& extract(stream&, long long, long long, dl::record&, + dl::error_handler&) noexcept (false); -stream_offsets findoffsets(dl::stream&) noexcept (false); +stream_offsets findoffsets(dl::stream&, dl::error_handler&) noexcept (false); std::map< dl::ident, std::vector< long long > > -findfdata(dl::stream&, const std::vector< long long >&) noexcept (false); +findfdata(dl::stream&, const std::vector< long long >&, dl::error_handler&) +noexcept (false); } diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 3e90bdbdb..38476d724 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -390,6 +390,40 @@ using value_vector = mpark::variant< std::vector< units > >; +/* + * Assigned error severity. + */ +enum class error_severity { + INFO = 1, // everything seems fine, but situation itself is not typical + MINOR = 2, // contradicts specification, but recovery is most likely ok + MAJOR = 3, // contradicts specification, not sure if recovery is ok + CRITICAL = 4, // broken beyond repair, could not recover +}; + +/* + * An error caused by a violation of the rp66 protocol. + * + * We classify violations into different severity levels. In some situations + * we know that the violation is common and have a good idea of how to + * recover. In other situations the violation might be too severe for us to do + * anything about it. + */ +struct dlis_error { + error_severity severity; + std::string problem; + std::string specification; + std::string action; +}; + +struct error_handler { + virtual void log(const error_severity &level, const std::string &context, + const std::string &problem, const std::string &specification, + const std::string &action) + const noexcept(false) = 0; + + virtual ~error_handler() = default; +}; + struct record { bool isexplicit() const noexcept (true); bool isencrypted() const noexcept (true); @@ -402,6 +436,23 @@ struct record { /* * The structure of an attribute as described in 3.2.2.1 + * + * Error handling: + * + * Due to the intricate structure of a dlis-file, dlisio typically over-parses + * when a certain piece of information is queried. This would make any + * warnings or errors issued from the parsing routines premature and might + * result in the query failing due to an error in some (from a + * user-perspective) unrelated data. + * + * To combat this, the result of parsing routines (and the state of the + * parsed object) is communicated through error codes set on the object. + * + * It is the consumer's responsibility to check the state of the + * object before using its content.
+ * + * object_attribute.log contains a list of dl::dlis_error, which provide + * human-readable explanation of the problem */ struct object_attribute { dl::ident label = {}; @@ -412,6 +463,8 @@ struct object_attribute { dl::value_vector value = {}; bool invariant = false; + std::vector< dl::dlis_error > log; + bool operator == (const object_attribute& ) const noexcept (true); }; @@ -471,6 +524,7 @@ struct basic_object { dl::obname object_name; dl::ident type; std::vector< object_attribute > attributes; + std::vector< dl::dlis_error > log; }; /* Object set @@ -502,6 +556,11 @@ struct basic_object { * encrypted records cannot be parsed by dlisio without being decrypted first. * As object_set does its parsing itself, it _will_ fail on construction if * given an encrypted record. + * + * Log: + * + * As well as attributes, every object set should have information about issues + * that arose during parsing. */ using object_vector = std::vector< basic_object >; @@ -513,14 +572,20 @@ struct object_set { dl::ident type; dl::ident name; + std::vector< dl::dlis_error > log; + dl::object_vector& objects() noexcept (false); private: dl::record record; dl::object_vector objs; dl::object_template tmpl; - void parse() noexcept (false); + void parse() noexcept (true); bool parsed = false; + + const char* parse_set_component(const char* cur) noexcept (false); + const char* parse_template(const char* cur) noexcept (false); + const char* parse_objects(const char* cur) noexcept (false); }; struct matcher { @@ -539,27 +604,17 @@ class pool { object_vector get(const std::string& type, const std::string& name, - const dl::matcher& matcher) noexcept (false); + const dl::matcher& matcher, + const error_handler& errorhandler) noexcept (false); object_vector get(const std::string& type, - const dl::matcher& matcher) noexcept (false); + const dl::matcher& matcher, + const error_handler& errorhandler) noexcept (false); private: std::vector< dl::object_set > eflrs; }; -const char* parse_template( const char* begin, - const char* end, - object_template& ) noexcept (false); - - -object_set parse_objects( const char*, const char* ) noexcept (false); -const char* parse_set_component( const char*, - const char*, - ident*, - ident*, - int* ) noexcept (false); - } #endif //DLISIO_EXT_TYPES_HPP diff --git a/lib/src/dlisio.cpp b/lib/src/dlisio.cpp index f4e86f41a..310250338 100644 --- a/lib/src/dlisio.cpp +++ b/lib/src/dlisio.cpp @@ -499,15 +499,14 @@ int dlis_component_set( std::uint8_t desc, int role, int* type, int* name ) { *type = desc & (1 << 4); *name = desc & (1 << 3); - if( !*type ) return DLIS_INCONSISTENT; return DLIS_OK; } int dlis_component_object( std::uint8_t desc, int role, int* obname ) { - if( role != DLIS_ROLE_OBJECT ) return DLIS_UNEXPECTED_VALUE; + if( role != DLIS_ROLE_OBJECT ) + return DLIS_UNEXPECTED_VALUE; *obname = desc & (1 << 4); - if( !*obname ) return DLIS_INCONSISTENT; return DLIS_OK; } diff --git a/lib/src/io.cpp b/lib/src/io.cpp index a4eb787ee..656fe5c87 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -177,7 +177,8 @@ bool type_consistent( const T& ) noexcept (true) { void trim_segment(std::uint8_t attrs, const char* begin, int segment_size, - std::vector< char >& segment) + std::vector< char >& segment, + dl::error_handler& errorhandler) noexcept (false) { int trim = 0; const auto* end = begin + segment_size; @@ -188,22 +189,27 @@ noexcept (false) { segment.resize(segment.size() - trim); return; - case DLIS_BAD_SIZE: + case DLIS_BAD_SIZE: { if (trim - segment_size != DLIS_LRSH_SIZE) { 
const auto msg = - "bad segment trim: padbytes (which is {}) " + "bad segment trim: trim size (which is {}) " ">= segment.size() (which is {})"; throw std::runtime_error(fmt::format(msg, trim, segment_size)); } - /* - * padbytes included the segment header. It's larger than - * the segment body, but only because it also counts the - * header. accept that, pretend the body was never added, - * and move on. - */ + errorhandler.log( + dl::error_severity::MINOR, + "extract (trim_segment)", + "trim size (padbytes + checksum + trailing length) = logical " + "record segment length", + "[from 2.2.2.1 Logical Record Segment Header (LRSH) and " + "2.2.2.4 Logical Record Segment Trailer (LRST) situation " + "should be impossible]", + "Segment is skipped"); + segment.resize(segment.size() - segment_size); return; + } default: throw std::invalid_argument("dlis_trim_record_segment"); @@ -299,14 +305,16 @@ template < typename T > using shortvec = std::basic_string< T >; -record extract(stream& file, long long tell) noexcept (false) { +record extract(stream& file, long long tell, error_handler& errorhandler) +noexcept (false) { record rec; rec.data.reserve( 8192 ); auto nbytes = std::numeric_limits< std::int64_t >::max(); - return extract(file, tell, nbytes, rec); + return extract(file, tell, nbytes, rec, errorhandler); } -record& extract(stream& file, long long tell, long long bytes, record& rec) noexcept (false) { +record& extract(stream& file, long long tell, long long bytes, record& rec, + error_handler& errorhandler) noexcept (false) { shortvec< std::uint8_t > attributes; shortvec< int > types; bool consistent = true; @@ -359,7 +367,7 @@ record& extract(stream& file, long long tell, long long bytes, record& rec) noex */ const auto* fst = rec.data.data() + prevsize; - trim_segment(attrs, fst, len, rec.data); + trim_segment(attrs, fst, len, rec.data, errorhandler); /* if the whole segment is getting trimmed, it's unclear if successor * attribute should be erased or not. For now ignoring. Suspecting @@ -388,58 +396,149 @@ record& extract(stream& file, long long tell, long long bytes, record& rec) noex } } -stream_offsets findoffsets( dl::stream& file) noexcept (false) { +stream_offsets findoffsets( dl::stream& file, dl::error_handler& errorhandler) +noexcept (false) { stream_offsets ofs; - std::int64_t offset = 0; + std::int64_t lr_offset = 0; + std::int64_t lrs_offset = 0; + + bool has_successor = false; char buffer[ DLIS_LRSH_SIZE ]; + const auto handle = [&]( const std::string& problem ) { + const auto context = "dl::findoffsets (indexing logical file)"; + errorhandler.log(dl::error_severity::CRITICAL, context, problem, "", + "Indexing is suspended at last valid Logical Record"); + ofs.broken.push_back( lr_offset ); + }; + int len = 0; + auto read = 0; + + file.seek(lrs_offset); + while (true) { - file.seek(offset); - file.read(buffer, DLIS_LRSH_SIZE); - if (file.eof()) + try { + read = file.read(buffer, DLIS_LRSH_SIZE); + } catch (std::exception& e) { + handle(e.what()); break; + } + + // read operation is enough to set eof correctly + if (file.eof()) { + if (read == 0) { + if (has_successor) { + const auto problem = + "Reached EOF, but last logical record segment expects " + "successor"; + handle(problem); + } + break; + } + if (read < 4) { + /* + * Very unlikely situation. + * Usually exception will be thrown during read. 
+ */ + const auto problem = + "File truncated in Logical Record Header"; + handle(problem); + break; + } + /* + * Do nothing if read == 4 + * This might be the problem for next Logical File. + * If not, it will be dealt with later + */ + } int type; std::uint8_t attrs; dlis_lrsh( buffer, &len, &attrs, &type ); if (len < 4) { - auto msg = "Too short logical record. Length can't be less than 4, " - "but was {}"; - throw std::runtime_error(fmt::format(msg, len)); + const auto problem = + "Too short logical record. Length can't be less than 4, " + "but was {}"; + handle(fmt::format(problem, len)); + break; } - int isexplicit = attrs & DLIS_SEGATTR_EXFMTLR; - if (not (attrs & DLIS_SEGATTR_PREDSEG)) { + bool isexplicit = attrs & DLIS_SEGATTR_EXFMTLR; + bool has_predecessor = attrs & DLIS_SEGATTR_PREDSEG; + + if (not (has_predecessor)) { if (isexplicit and type == 0 and ofs.explicits.size()) { /* - * Wrap up when we encounter a EFLR of type FILE-HEADER that is - * NOT the first Logical Record. More precisely we expect the - * _first_ LR we encounter to be a FILE-HEADER. We gather up - * this LR and all successive LR's until we encounter another - * FILE-HEADER. + * Wrap up when we encounter a EFLR of type FILE-HEADER that + * is NOT the first Logical Record. More precisely we expect + * the _first_ LR we encounter to be a FILE-HEADER. We + * gather up this LR and all successive LR's until we + * encounter another FILE-HEADER. */ - file.seek( offset ); + + if (has_successor) { + const auto problem = + "End of logical file, but last logical " + "record segment expects successor"; + handle(problem); + break; + } + + // seek to assure handle is in the right place to read next LF + file.seek( lrs_offset ); break; } - if (isexplicit) ofs.explicits.push_back( offset ); + } + + has_successor = attrs & DLIS_SEGATTR_SUCCSEG; + lrs_offset += len; + + /* + * Skip the segment by moving the cursor to the next offset. + * Seek operation alone isn't enough to correctly set EOF. To make sure + * record is not truncated, read its last byte instead of seeking + * to the new offset. + * + * Note that lfp returns UNEXPECTED_EOF for cfile when truncation + * happens inside of declared data + * TODO: assure behavior for other io types + */ + + char tmp; + file.seek(lrs_offset - 1); + try { + file.read(&tmp, 1); + } catch (const std::runtime_error& e) { + const auto problem = "File truncated in Logical Record Segment"; + handle(problem); + break; + } + + + if (not (has_successor)) { + if (isexplicit) + ofs.explicits.push_back( lr_offset ); /* * Consider doing fdata-indexing on the fly as we are now at the - * correct offset to read the OBNAME. That would mean we only need - * to traverse the file a single time to index it. Additionally it - * would make the caller code from python way nicer. + * correct offset to read the OBNAME. That would mean we only + * need to traverse the file a single time to index it. + * Additionally it would make the caller code from python way + * nicer. 
*/ - else ofs.implicits.push_back( offset ); + else + ofs.implicits.push_back( lr_offset ); + + lr_offset = lrs_offset; } - offset += len; } return ofs; } std::map< dl::ident, std::vector< long long > > -findfdata(dl::stream& file, const std::vector< long long >& tells) -noexcept (false) { +findfdata(dl::stream& file, const std::vector< long long >& tells, +dl::error_handler& errorhandler) noexcept (false) { std::map< dl::ident, std::vector< long long > > xs; constexpr std::size_t OBNAME_SIZE_MAX = 262; @@ -447,8 +546,20 @@ noexcept (false) { record rec; rec.data.reserve( OBNAME_SIZE_MAX ); + const auto handle = [&]( const std::string& problem ) { + const auto context = "dl::findfdata: Indexing implicit records"; + errorhandler.log(dl::error_severity::CRITICAL, context, problem, "", + "Record is skipped"); + }; + for (auto tell : tells) { - extract(file, tell, OBNAME_SIZE_MAX, rec); + try { + extract(file, tell, OBNAME_SIZE_MAX, rec, errorhandler); + } catch (std::exception& e) { + handle(e.what()); + continue; + } + if (rec.isencrypted()) continue; if (rec.type != 0) continue; if (rec.data.size() == 0) continue; @@ -461,8 +572,10 @@ noexcept (false) { std::size_t obname_size = cur - rec.data.data(); if (obname_size > rec.data.size()) { - auto msg = "File corrupted. Error on reading fdata obname"; - throw std::runtime_error(msg); + const auto problem = + "fdata record corrupted, error on reading obname"; + handle(problem); + continue; } dl::obname tmp{ dl::origin{ origin }, dl::ushort{ copy }, diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 2d16bac95..c61bfa156 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -12,14 +12,6 @@ namespace { -void user_warning( const std::string& ) noexcept (true) { - // TODO: -} - -void debug_warning( const std::string& ) noexcept (true) { - // TODO: -} - struct set_descriptor { int role; bool type; @@ -33,45 +25,27 @@ set_descriptor parse_set_descriptor( const char* cur ) noexcept (false) { set_descriptor flags; dlis_component( attr, &flags.role ); - switch (flags.role) { - case DLIS_ROLE_RDSET: - case DLIS_ROLE_RSET: - case DLIS_ROLE_SET: - break; - - default: { - const auto bits = std::bitset< 8 >{ attr }.to_string(); - const auto role = dlis_component_str(flags.role); - const auto msg = "error parsing object set descriptor: " - "expected SET, RSET or RDSET, was {} ({})" - ; - throw std::invalid_argument(fmt::format(msg, role, bits)); - } - } - int type, name; const auto err = dlis_component_set( attr, flags.role, &type, &name ); - flags.type = type; - flags.name = name; switch (err) { case DLIS_OK: break; - - case DLIS_INCONSISTENT: - /* - * 3.2.2.2 Component usage - * The Set Component contains the Set Type, which is not optional - * and must not be null, and the Set Name, which is optional. - */ - user_warning( "SET:type not set, but must be non-null." 
); - flags.type = true; - break; - + case DLIS_UNEXPECTED_VALUE: { + const auto bits = std::bitset< 8 >{ attr }.to_string(); + const auto role = dlis_component_str(flags.role); + const auto msg = "error parsing object set descriptor: " + "expected SET, RSET or RDSET, was {} ({})" + ; + throw std::invalid_argument(fmt::format(msg, role, bits)); + } default: throw std::runtime_error("unhandled error in dlis_component_set"); } + flags.type = type; + flags.name = name; + return flags; } @@ -107,18 +81,10 @@ attribute_descriptor parse_attribute_descriptor( const char* cur ) { case DLIS_ROLE_INVATR: flags.invariant = true; - - case DLIS_ROLE_ATTRIB: break; - default: { - const auto bits = std::bitset< 8 >(role).to_string(); - const auto was = dlis_component_str(role); - const auto msg = "error parsing attribute descriptor: " - "expected ATTRIB, INVATR, ABSATR or OBJECT, was {} ({})" - ; - throw std::invalid_argument(fmt::format(msg, was, bits)); - } + default: + break; } if (flags.object || flags.absent) return flags; @@ -129,17 +95,30 @@ attribute_descriptor parse_attribute_descriptor( const char* cur ) { &reprc, &units, &value ); + + switch (err) { + case DLIS_OK: + break; + case DLIS_UNEXPECTED_VALUE: { + const auto bits = std::bitset< 8 >(role).to_string(); + const auto was = dlis_component_str(role); + const auto msg = "error parsing attribute descriptor: " + "expected ATTRIB, INVATR, ABSATR or OBJECT, " + "was {} ({})"; + throw std::invalid_argument(fmt::format(msg, was, bits)); + } + default: + throw std::runtime_error( "unhandled error in " + "dlis_component_attrib" ); + } + flags.label = label; flags.count = count; flags.reprc = reprc; flags.units = units; flags.value = value; - if (!err) return flags; - - // all sources for this error should've been checked, so - // something is REALLY wrong if we end up here - throw std::runtime_error( "unhandled error in dlis_component_attrib" ); + return flags; } struct object_descriptor { @@ -153,20 +132,29 @@ object_descriptor parse_object_descriptor( const char* cur ) { int role; dlis_component( attr, &role ); - if (role != DLIS_ROLE_OBJECT) { - const auto bits = std::bitset< 8 >{ attr }.to_string(); - const auto was = dlis_component_str(role); - const auto msg = "error parsing object descriptor: " - "expected OBJECT, was {} ({})" - ; - throw std::invalid_argument(fmt::format(msg, was, bits)); - } - int name; const auto err = dlis_component_object( attr, role, &name ); - if (err) user_warning( "OBJECT:name was not set, but must be non-null" ); - return { true }; + switch (err) { + case DLIS_OK: + break; + case DLIS_UNEXPECTED_VALUE: { + const auto bits = std::bitset< 8 >{ attr }.to_string(); + const auto was = dlis_component_str(role); + const auto msg = "error parsing object descriptor: " + "expected OBJECT, was {} ({})" + ; + throw std::invalid_argument(fmt::format(msg, was, bits)); + } + default: + throw std::runtime_error("unhandled error in " + "dlis_component_object"); + } + + object_descriptor flags; + flags.name = name; + + return flags; } using std::swap; @@ -454,14 +442,37 @@ const char* cast( const char* xs, xs = cast( xs, x ); if (x < DLIS_FSHORT || x > DLIS_UNITS) { - debug_warning("Read incorrect representation code"); reprc = dl::representation_code::undef; - }else{ + } else { reprc = static_cast< dl::representation_code >( x ); } return xs; } +const char* repcode(const char* xs, dl::object_attribute& attr ) +noexcept (false) { + dl::representation_code& reprc = attr.reprc; + + auto cur = cast(xs, reprc); + if (reprc == 
dl::representation_code::undef) { + // retrieve value again for reporting, as it is lost + dl::ushort x{ 0 }; + cast( xs, x ); + + const auto msg = "Invalid representation code {}"; + const auto code = static_cast< int >(x); + dl::dlis_error err { + dl::error_severity::MINOR, + fmt::format(msg, code), + "Appendix B: Representation Codes", + "Continue. Postpone dealing with this until later" + }; + attr.log.push_back(err); + } + + return cur; +} + template < typename T > const char* extract( std::vector< T >& vec, std::int32_t count, @@ -483,12 +494,10 @@ std::vector< T >& reset( dl::value_vector& value ) noexcept (false) { return value.emplace< std::vector< T > >(); } -const char* elements( const char* xs, - dl::uvari count, - dl::representation_code reprc, - dl::value_vector& vec ) { - - const auto n = dl::decay( count ); +const char* elements( const char* xs, dl::object_attribute& attr ) { + const auto reprc = attr.reprc; + dl::value_vector& vec = attr.value; + const auto n = dl::decay( attr.count ); if (n == 0) { vec = mpark::monostate{}; @@ -682,10 +691,8 @@ bool basic_object::operator != (const basic_object& o) const noexcept (true) { } -const char* parse_template( const char* cur, - const char* end, - object_template& out ) noexcept (false) { - object_template tmp; +const char* object_set::parse_template(const char* cur) noexcept (false) { + const char* end = this->record.data.data() + this->record.data.size(); while (true) { if (cur >= end) @@ -693,7 +700,6 @@ const char* parse_template( const char* cur, const auto flags = parse_attribute_descriptor( cur ); if (flags.object) { - swap( tmp, out ); return cur; } @@ -701,38 +707,49 @@ const char* parse_template( const char* cur, cur += DLIS_DESCRIPTOR_SIZE; if (flags.absent) { - user_warning( "ABSATR in object template - skipping" ); + dlis_error err { + dl::error_severity::MAJOR, + "Absent Attribute in object set template", + "3.2.2.2 Component Usage: A Template consists of a collection " + "of Attribute Components and/or Invariant Attribute " + "Components, mixed in any fashion.", + "Attribute not included in template" + }; + this->log.push_back(err); continue; } object_attribute attr; if (!flags.label) { - /* - * 3.2.2.2 Component usage - * All Components in the Template must have distinct, non-null - * Labels. 
- * - * Assume that if this isn't set properly it's a corrupted - * descriptor, so just try to read the label anyway - */ - user_warning( "Label not set, but must be non-null" ); + dlis_error err { + dl::error_severity::MAJOR, + "Label not set in template", + "3.2.2.2 Component Usage: All Components in the Template must " + "have distinct, non-null Labels.", + "Assumed attribute descriptor corrupted, attempt to read " + "label anyway" + }; + this->log.push_back(err); } cur = cast( cur, attr.label ); if (flags.count) cur = cast( cur, attr.count ); - if (flags.reprc) cur = cast( cur, attr.reprc ); + if (flags.reprc) cur = repcode( cur, attr ); if (flags.units) cur = cast( cur, attr.units ); - if (flags.value) cur = elements( cur, attr.count, - attr.reprc, - attr.value ); + if (flags.value) cur = elements( cur, attr ); attr.invariant = flags.invariant; - tmp.push_back( std::move( attr ) ); + this->tmpl.push_back( std::move( attr ) ); if (cur == end){ - debug_warning("Set contains no objects"); - swap( tmp, out ); + dlis_error err { + dl::error_severity::INFO, + "Set contains no objects", + "3.2.2.2 Component Usage: A Set consists of one or more Objects", + "Leave the set empty and return" + }; + this->log.push_back(err); return cur; } } @@ -774,11 +791,13 @@ struct shrink { } }; -void patch_missing_value( dl::value_vector& value, - std::size_t count, - dl::representation_code reprc ) +void patch_missing_value( dl::object_attribute& attr ) noexcept (false) { + const dl::representation_code reprc = attr.reprc; + const std::size_t count = dl::decay( attr.count ); + dl::value_vector& value = attr.value; + /* * value is *NOT* monostate, i.e. there is a default value. if count != * values.size(), resize it. @@ -790,19 +809,41 @@ noexcept (false) /* smaller, shrink and all is fine */ if (size > count) { + + const auto msg = + "template value is not overridden by object attribute, but " + "count is. count ({}) < template count ({})"; + mpark::visit( shrink( count ), value ); + dlis_error err { + dl::error_severity::MAJOR, + fmt::format(msg, count, size), + "3.2.2.1 Component Descriptor: The number of Elements that " + "make up the Value is specified by the Count " + "Characteristic.", + "shrank template value to new attribute count" + }; + attr.log.push_back(err); return; } /* - * count is larger, so insert default values, maybe? for now, throw - * exception and consider what to do when a file actually uses this - * behaviour + * count is larger, which makes little sense. Likely file is already + * spoiled, but mark attribute as errored and attempt to continue */ - const auto msg = "object attribute without no explicit value, but " - "count (which is {}) > size (which is {})" - ; - throw dl::not_implemented(fmt::format(msg, count, size)); + const auto msg = + "template value is not overridden by object attribute, but " + "count is. 
count ({}) > template count ({})"; + + dlis_error err { + dl::error_severity::CRITICAL, + fmt::format(msg, count, size), + "3.2.2.1 Component Descriptor: The number of Elements that " + "make up the Value is specified by the Count " + "Characteristic.", + "value is left as in template" + }; + attr.log.push_back(err); } /* @@ -844,24 +885,57 @@ noexcept (false) case rpc::status: reset< dl::status >(value).resize(count); return; case rpc::units: reset< dl::units >(value).resize(count); return; default: { - const auto msg = "unable to patch attribute with no value: " - "unknown representation code {}"; + // repcode is incorrect, but value is missing + // hence we can log an error but continue processing + const auto msg = + "invalid representation code {}"; + /* TODO: there is a problem with reporting. If representation code + * is undefined, we define the value to be 66, not the actual value + * that is present in the file + */ const auto code = static_cast< int >(reprc); - throw std::runtime_error(fmt::format(msg, code)); + dl::dlis_error err { + dl::error_severity::CRITICAL, + fmt::format(msg, code), + "Appendix B: Representation Codes", + "attribute value is left as template default. Continue" + }; + attr.log.push_back(err); + } } } -object_vector parse_objects( const object_template& tmpl, - const dl::ident type, - const char* cur, - const char* end ) noexcept (false) { +bool is_log_clear( const std::vector< dlis_error >& log ) noexcept (true) { + for (const auto& entry : log) { + if (entry.severity >= dl::error_severity::MINOR) + return false; + } + return true; +} - object_vector objs; +void report_set_errors(const dl::object_set& eflr, + const dl::error_handler& errorhandler) { + if (eflr.log.size()) { + const auto context = "object set of type '" + + dl::decay(eflr.type) + "' named '" + + dl::decay(eflr.name) +"'"; + for (const auto &err : eflr.log) { + errorhandler.log(err.severity, context, err.problem, + err.specification, err.action); + } + } +} + +} + +const char* object_set::parse_objects(const char* cur) noexcept (false) { + + const char* end = this->record.data.data() + this->record.data.size(); const auto default_object = defaulted_object( tmpl ); - while (true) { - if (std::distance( cur, end ) <= 0) + while (cur != end) { + if (std::distance( cur, end ) < 0) throw std::out_of_range( "unexpected end-of-record" ); auto object_flags = parse_object_descriptor( cur ); @@ -869,7 +943,21 @@ object_vector parse_objects( const object_template& tmpl, auto current = default_object; current.type = type; - if (object_flags.name) cur = cast( cur, current.object_name ); + bool object_clear = true; + + if (!object_flags.name) { + dlis_error err { + dl::error_severity::MAJOR, + "OBJECT:name was not set", + "3.2.2.1 Component Descriptor: That is, every Object has " + "a non-null Name", + "Assumed object descriptor corrupted, attempt to read name " + "anyway" + }; + current.log.push_back(err); + } + + cur = cast( cur, current.object_name ); for (const auto& template_attr : tmpl) { if (template_attr.invariant) continue; @@ -893,28 +981,31 @@ object_vector parse_objects( const object_template& tmpl, } if (flags.invariant) { - /* - * 3.2.2.2 Component Usage - * Invariant Attribute Components, which may only appear in - * the Template [...] 
- * - * Assume this is a mistake, assume it was a regular - * non-invariant attribute - */ - user_warning("ATTRIB:invariant in attribute, " - "but should only be in template"); + dlis_error err { + dl::error_severity::MAJOR, + "Invariant attribute in object attributes", + "3.2.2.2 Component Usage: Invariant Attribute Components, " + "which may only appear in the Template [...]", + "ignored invariant bit, assumed that attribute followed" + }; + attr.log.push_back(err); } if (flags.label) { - user_warning( "ATTRIB:label set, but must be null"); + dlis_error err { + dl::error_severity::MAJOR, + "Label bit set in object attribute", + "3.2.2.2 Component Usage: Attribute Components that follow " + "Object Components must not have Attribute Labels", + "ignored label bit, assumed that label never followed" + }; + attr.log.push_back(err); } if (flags.count) cur = cast( cur, attr.count ); - if (flags.reprc) cur = cast( cur, attr.reprc ); + if (flags.reprc) cur = repcode( cur, attr ); if (flags.units) cur = cast( cur, attr.units ); - if (flags.value) cur = elements( cur, attr.count, - attr.reprc, - attr.value ); + if (flags.value) cur = elements( cur, attr ); const auto count = dl::decay( attr.count ); @@ -944,31 +1035,43 @@ object_vector parse_objects( const object_template& tmpl, const auto msg = "count ({}) isn't 0 and representation " "code ({}) changed, but value is not explicitly set"; const auto code = static_cast< int >(attr.reprc); - user_warning(fmt::format(msg, count, code)); + dlis_error err { + dl::error_severity::MAJOR, + fmt::format(msg, count, code), + "", + "value defaulted based on representation code from " + "attribute" + }; + attr.log.push_back(err); attr.value = mpark::monostate{}; } - patch_missing_value( attr.value, count, attr.reprc ); + patch_missing_value( attr ); } + object_clear = object_clear && is_log_clear(attr.log); + current.set(attr); } - objs.push_back( std::move( current ) ); + if (not object_clear) { + const auto msg = + "One or more attributes of this object violate specification. 
" + "This can potentially corrupt the entire object"; + dlis_error err{ + dl::error_severity::MINOR, msg, "", ""}; + current.log.push_back(err); + } - if (cur == end) break; + this->objs.push_back( std::move( current ) ); } - return objs; + return cur; } -} +const char* object_set::parse_set_component(const char* cur) noexcept (false) { -const char* parse_set_component( const char* cur, - const char* end, - dl::ident* type, - dl::ident* name, - int* role) { + const char* end = this->record.data.data() + this->record.data.size(); if (std::distance( cur, end ) <= 0) throw std::out_of_range( "eflr must be non-empty" ); @@ -980,6 +1083,36 @@ const char* parse_set_component( const char* cur, throw std::out_of_range( msg ); } + switch (flags.role) { + case DLIS_ROLE_RDSET: { + dlis_error err { + dl::error_severity::MINOR, + "Redundant sets are not supported by dlisio", + "3.2.2.2 Component Usage: A Redundant Set is an identical copy " + "of some Set written previously in the same Logical File", + "Redundant set is treated as a normal set, which might lead " + "to issues with duplicated objects" + }; + this->log.push_back(err); + break; + } + case DLIS_ROLE_RSET: { + dlis_error err { + dl::error_severity::MAJOR, + "Replacement sets are not supported by dlisio", + "3.2.2.2 Component Usage: Attributes of the Replacement Set " + "reflect all updates that may have been applied since the " + "original Set was written", + "Replacement set is treated as a normal set, which might lead " + "to issues with duplicated objects and invalid information" + }; + this->log.push_back(err); + break; + } + default: + break; + } + /* * TODO: check for every read that inside [begin,end)? */ @@ -988,43 +1121,57 @@ const char* parse_set_component( const char* cur, dl::ident tmp_type; dl::ident tmp_name; - if (flags.type) cur = cast( cur, tmp_type ); + if (!flags.type) { + dlis_error err { + dl::error_severity::MAJOR, + "SET:type not set", + "3.2.2.1 Component Descriptor: A Set’s Type Characteristic must " + "be non-null and must always be explicitly present in " + "the Set Component", + "Assumed set descriptor corrupted, attempt to read type anyway" + }; + this->log.push_back(err); + } + + cur = cast( cur, tmp_type ); if (flags.name) cur = cast( cur, tmp_name ); - if (type) *type = tmp_type; - if (name) *name = tmp_name; - if (role) *role = tmp_role; + this->type = tmp_type; + this->name = tmp_name; + this->role = tmp_role; return cur; } object_set::object_set(dl::record rec) noexcept (false) { - parse_set_component(rec.data.data(), - rec.data.data() + rec.data.size(), - &this->type, - &this->name, - &this->role); this->record = std::move(rec); + parse_set_component(this->record.data.data()); } -void object_set::parse() noexcept (false) { +void object_set::parse() noexcept (true) { if (this->parsed) return; - const char* beg = this->record.data.data(); - const char* end = beg + this->record.data.size(); - - /* Skip past the set component as it's already been read and parsed */ - auto cur = parse_set_component(beg, end, nullptr, nullptr, nullptr); - - object_template tmpl; - cur = parse_template(cur, end, tmpl); - - //TODO parse_object should return empty list when there are no objects - if (std::distance( cur, end ) > 0) { - auto objs = parse_objects(tmpl, this->type, cur, end); - this->objs = objs; + const char* cur = this->record.data.data(); + + try { + /* As cursor value is not stored, read data again to get the position */ + cur = parse_set_component(cur); + cur = parse_template(cur); + parse_objects(cur); + } 
catch (const std::exception& e) { + dlis_error err { + dl::error_severity::CRITICAL, + e.what(), + "", + "object set parse has been interrupted" + }; + this->log.push_back(err); } + // TODO: possible assert that cur == end of data - this->tmpl = tmpl; + /* If set is parsed in default mode, exception will be thrown and set won't + * be considered parsed. If set is parsed after that in error-escape mode, + * parsing will be locked, but error will get stored on object set anyway. + */ this->parsed = true; } @@ -1043,7 +1190,8 @@ std::vector< dl::ident > pool::types() const noexcept (true) { object_vector pool::get(const std::string& type, const std::string& name, - const dl::matcher& m) + const dl::matcher& m, + const error_handler& errorhandler) noexcept (false) { object_vector objs; @@ -1055,12 +1203,15 @@ noexcept (false) { objs.push_back(obj); } + + report_set_errors (eflr, errorhandler); } return objs; } object_vector pool::get(const std::string& type, - const dl::matcher& m) + const dl::matcher& m, + const error_handler& errorhandler) noexcept (false) { object_vector objs; @@ -1069,6 +1220,8 @@ noexcept (false) { auto tmp = eflr.objects(); objs.insert(objs.end(), tmp.begin(), tmp.end()); + + report_set_errors (eflr, errorhandler); } return objs; } diff --git a/lib/test/protocol.cpp b/lib/test/protocol.cpp index ca9fda3b3..2de765a3d 100644 --- a/lib/test/protocol.cpp +++ b/lib/test/protocol.cpp @@ -203,21 +203,21 @@ TEST_CASE("Set descriptors", "[component][v1]") { err = dlis_component_set( N, role, &type, &name ); CHECK( !type ); CHECK( name ); - CHECK( err == DLIS_INCONSISTENT ); + CHECK( err == DLIS_OK ); } SECTION("SET: ") { err = dlis_component_set( Z, role, &type, &name ); CHECK( !type ); CHECK( !name ); - CHECK( err == DLIS_INCONSISTENT ); + CHECK( err == DLIS_OK ); } SECTION("SET: reserved values") { err = dlis_component_set( R, DLIS_ROLE_RDSET, &type, &name ); CHECK( !type ); CHECK( !name ); - CHECK( err == DLIS_INCONSISTENT ); + CHECK( err == DLIS_OK ); } } @@ -244,7 +244,7 @@ TEST_CASE("Object descriptors", "[component][v1]") { SECTION("Object: ") { err = dlis_component_object( O, role, &name ); - CHECK( err == DLIS_INCONSISTENT ); + CHECK( err == DLIS_OK ); CHECK( !name ); } diff --git a/python/data/chap2/README.rst b/python/data/chap2/README.rst index b94e01884..a030c8f74 100644 --- a/python/data/chap2/README.rst +++ b/python/data/chap2/README.rst @@ -30,13 +30,21 @@ incomplete-sul.dlis Incomplete storage unit label incomplete-vr.dlis Incomplete visible record + +lf-lrs-inconsistency.dlis First implicit lrs expects successor, but + what follows is a new Logical File + +lf-truncated-after-lrsh.dlis First LF is ok, second is truncated after + File Header lrs + nondlis.txt Not a dlis file old-vr.dlis Older visible record, see 2.3.6.2 Format Version of rp66v1 padbytes-bad.dlis Mismatch between padbyte length and logical - record length + record length. Second LR in the file is + correct. 
padbytes-encrypted.dlis Explicitly formatted logical record with encrypted padbytes @@ -59,16 +67,28 @@ small.dlis Test file smaller than 4kB too-small-record.dlis Visible record is smaller than the minimum requirement (20Bytes) +truncated-after-lrsh.dlis File is truncated after LRSH which expects + successor + +truncated-in-iflr.dlis Truncated one byte before expected end of + long IFLR + truncated-in-lrsh.dlis File is truncated in the LRSH -truncated-on-full-lr.dlis File is truncated on a complete LR, but not - VR +truncated-in-lrsh-vr-over.dlis File is truncated in the LRSH, VR end aligns + with EOF. Extremely unlikely situation. truncated-in-second-lr.dlis Mismatch between visible record length and remaining bytes. Second LR truncated -truncated-on-lrs.dlis LRS expects successor, but none comes. File - is truncated +truncated-lr-no-lrs-in-vr.dlis LRS expects successor, but none comes. File + is truncated. VR is also truncated + +truncated-lr-no-lrs-vr-over.dlis LRS expects successor, but none comes. File + is truncated. LRS and VR ends align + +truncated-on-full-lr.dlis File is truncated on a complete LR, but not + VR wrong-lrhs.dlis Mismatch between logical record segment length and remaining bytes in visible @@ -76,7 +96,9 @@ wrong-lrhs.dlis Mismatch between logical record segment length) zeroed-in-1st-lr.dlis VR is expected to contain over 1 LR. Data is - zeroed from middle of 1st LR + zeroed from middle of 1st LR. There are 512 + zeroes (more than default 200 bytes + findvrl value) zeroed-in-2nd-lr.dlis VR contains 2 LRs. Data is zeroed in the middle of 2nd LR diff --git a/python/data/chap2/lf-lrs-inconsistency.dlis b/python/data/chap2/lf-lrs-inconsistency.dlis new file mode 100644 index 000000000..742ff8717 Binary files /dev/null and b/python/data/chap2/lf-lrs-inconsistency.dlis differ diff --git a/python/data/chap2/lf-truncated-after-lrsh.dlis b/python/data/chap2/lf-truncated-after-lrsh.dlis new file mode 100644 index 000000000..5b940144c Binary files /dev/null and b/python/data/chap2/lf-truncated-after-lrsh.dlis differ diff --git a/python/data/chap2/padbytes-bad.dlis b/python/data/chap2/padbytes-bad.dlis index 17d370099..190853756 100644 Binary files a/python/data/chap2/padbytes-bad.dlis and b/python/data/chap2/padbytes-bad.dlis differ diff --git a/python/data/chap2/truncated-after-lrsh.dlis b/python/data/chap2/truncated-after-lrsh.dlis new file mode 100644 index 000000000..1df737ab9 Binary files /dev/null and b/python/data/chap2/truncated-after-lrsh.dlis differ diff --git a/python/data/chap2/truncated-in-iflr.dlis b/python/data/chap2/truncated-in-iflr.dlis new file mode 100644 index 000000000..62fd2e96c Binary files /dev/null and b/python/data/chap2/truncated-in-iflr.dlis differ diff --git a/python/data/chap2/truncated-in-lrsh-vr-over.dlis b/python/data/chap2/truncated-in-lrsh-vr-over.dlis new file mode 100644 index 000000000..957a27b7b Binary files /dev/null and b/python/data/chap2/truncated-in-lrsh-vr-over.dlis differ diff --git a/python/data/chap2/truncated-on-lrs.dlis b/python/data/chap2/truncated-lr-no-lrs-in-vr.dlis similarity index 100% rename from python/data/chap2/truncated-on-lrs.dlis rename to python/data/chap2/truncated-lr-no-lrs-in-vr.dlis diff --git a/python/data/chap2/truncated-lr-no-lrs-vr-over.dlis b/python/data/chap2/truncated-lr-no-lrs-vr-over.dlis new file mode 100644 index 000000000..442e9f8e8 Binary files /dev/null and b/python/data/chap2/truncated-lr-no-lrs-vr-over.dlis differ diff --git a/python/data/chap2/zeroed-in-1st-lr.dlis 
b/python/data/chap2/zeroed-in-1st-lr.dlis index d403f48f1..8add87bdf 100644 Binary files a/python/data/chap2/zeroed-in-1st-lr.dlis and b/python/data/chap2/zeroed-in-1st-lr.dlis differ diff --git a/python/data/chap3/README.rst b/python/data/chap3/README.rst index 619efeb8f..36d203944 100644 --- a/python/data/chap3/README.rst +++ b/python/data/chap3/README.rst @@ -25,8 +25,9 @@ start.dlis.part SUL + all the VR and LRSH bytesiBytes 50 and 51 ====================================== ======================================== Files in /implicit Description ====================================== ======================================== -fdata-broken-obname.dlis File is truncated in the obname of fdata - record +fdata-broken-obname.dlis Obname length in first implicit record is + bigger than remaining record length. + Second implicit record is correct fdata-encrypted.dlis Fdata record is encrypted @@ -57,6 +58,17 @@ fdata-vr-obname-trailing.dlis Fdata obname is interrupted by Visible ====================================== ======================================== +*explicit* folder contains full files. + +====================================== ======================================== +Files in /explicit Description +====================================== ======================================== +broken-in-set.dlis First EFLR is correct, second EFLR is + broken in set component (not a set), + third EFLR is correct +====================================== ======================================== + + Creating a valid .dlis-file --------------------------- diff --git a/python/data/chap3/explicit/broken-in-set.dlis b/python/data/chap3/explicit/broken-in-set.dlis new file mode 100644 index 000000000..a1502f83a Binary files /dev/null and b/python/data/chap3/explicit/broken-in-set.dlis differ diff --git a/python/data/chap3/implicit/fdata-broken-obname.dlis b/python/data/chap3/implicit/fdata-broken-obname.dlis index 917634792..4e1638b80 100644 Binary files a/python/data/chap3/implicit/fdata-broken-obname.dlis and b/python/data/chap3/implicit/fdata-broken-obname.dlis differ diff --git a/python/data/chap3/objattr/reprcode-invalid-no-value.dlis.part b/python/data/chap3/objattr/reprcode-invalid-no-value.dlis.part new file mode 100644 index 000000000..6613a6877 --- /dev/null +++ b/python/data/chap3/objattr/reprcode-invalid-no-value.dlis.part @@ -0,0 +1 @@ +$ \ No newline at end of file diff --git a/python/data/chap3/objattr/reprcode-invalid.dlis.part b/python/data/chap3/objattr/reprcode-invalid-value.dlis.part similarity index 100% rename from python/data/chap3/objattr/reprcode-invalid.dlis.part rename to python/data/chap3/objattr/reprcode-invalid-value.dlis.part diff --git a/python/data/chap3/object/object2.dlis.part b/python/data/chap3/object/object2.dlis.part new file mode 100644 index 000000000..90011b0e3 --- /dev/null +++ b/python/data/chap3/object/object2.dlis.part @@ -0,0 +1 @@ +pOBJECT2 \ No newline at end of file diff --git a/python/data/chap3/set/redundant.dlis.part b/python/data/chap3/set/redundant.dlis.part new file mode 100644 index 000000000..4e881f166 Binary files /dev/null and b/python/data/chap3/set/redundant.dlis.part differ diff --git a/python/data/chap3/set/replacement.dlis.part b/python/data/chap3/set/replacement.dlis.part new file mode 100644 index 000000000..7b0a210f7 Binary files /dev/null and b/python/data/chap3/set/replacement.dlis.part differ diff --git a/python/data/chap3/template/attribute2.part b/python/data/chap3/template/attribute2.part new file mode 100644 index 000000000..69ab2f71b 
Binary files /dev/null and b/python/data/chap3/template/attribute2.part differ diff --git a/python/data/chap4-7/README.rst b/python/data/chap4-7/README.rst index b4575316a..a18a85891 100644 --- a/python/data/chap4-7/README.rst +++ b/python/data/chap4-7/README.rst @@ -93,6 +93,10 @@ broken-ascii.dlis Ascii with incorrect specified characters are expected, but only 20 are actually present +broken-fmt.dlis There is less data in the first fdata record than there + should be according to the fmt string. Second IFLR is + correct. + broken-utf8-ascii.dlis Ascii containing non-utf8 characters duplicate-framenos.dlis 2 FDATA containing different values, but they have @@ -160,7 +164,9 @@ invalid-date-in-origin.dlis Simple file which contains inva many-logical-files.dlis Contains several logical files, one without file header -many-logical-files-error-in-last.dlis Contains several logical files, last one is broken +many-logical-files-error-in-last.dlis Contains several logical files, last one is broken. + The first two have a minor issue (Origin sets are + redundant) many-logical-files-same-object.dlis Contains 2 logical files with the same objects and encrypted records diff --git a/python/data/chap4-7/iflr/broken-fmt.dlis b/python/data/chap4-7/iflr/broken-fmt.dlis new file mode 100644 index 000000000..26312ba78 Binary files /dev/null and b/python/data/chap4-7/iflr/broken-fmt.dlis differ diff --git a/python/data/chap4-7/many-logical-files-error-in-last.dlis b/python/data/chap4-7/many-logical-files-error-in-last.dlis index 90fae836f..5512a388f 100644 Binary files a/python/data/chap4-7/many-logical-files-error-in-last.dlis and b/python/data/chap4-7/many-logical-files-error-in-last.dlis differ diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index 340c68247..a0ecd9e2b 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -1,5 +1,6 @@ from . import core from . import plumbing +from .
import errors from .settings import get_encodings, set_encodings from .file import physicalfile, logicalfile from .load import open, load diff --git a/python/dlisio/dlisutils.py b/python/dlisio/dlisutils.py index 97cbbeaa2..dd668fe02 100644 --- a/python/dlisio/dlisutils.py +++ b/python/dlisio/dlisutils.py @@ -24,4 +24,5 @@ def curves(dlis, frame, dtype, pre_fmt, fmt, post_fmt): indices, dtype.itemsize, alloc, + dlis.error_handler ) diff --git a/python/dlisio/errors/__init__.py b/python/dlisio/errors/__init__.py new file mode 100644 index 000000000..1ee5b32c0 --- /dev/null +++ b/python/dlisio/errors/__init__.py @@ -0,0 +1 @@ +from .errorhandler import * diff --git a/python/dlisio/errors/errorhandler.py b/python/dlisio/errors/errorhandler.py new file mode 100644 index 000000000..682ab0b0c --- /dev/null +++ b/python/dlisio/errors/errorhandler.py @@ -0,0 +1,206 @@ +import logging +from enum import Enum + +from dlisio import core + +class Actions(Enum): + """ + Actions available for various specification violations + """ + LOG_DEBUG = lambda msg: logging.debug(msg) + """logging.debug""" + LOG_INFO = lambda msg: logging.info(msg) + """logging.info""" + LOG_WARNING = lambda msg: logging.warning(msg) + """logging.warning""" + LOG_ERROR = lambda msg: logging.error(msg) + """logging.error""" + RAISE = lambda msg: ErrorHandler.raise_msg(msg) + """raise RuntimeError""" + SWALLOW = lambda msg: ErrorHandler.swallow(msg) + """pass""" + + def __init__(self, handle): + self.handle = handle + +class ErrorHandler(core.error_handler): + """ Defines rules about error handling + + Many .dlis files happen to be non-compliant with the specification or simply + broken. This class gives the user some control over the handling of such + files. + + When dlisio encounters a specification violation, it categorises the issue + based on the severity of the violation. Some issues are easy to ignore + while others might force dlisio to give up on its current task. + ErrorHandler supplies an interface for changing how dlisio reacts to + different violations in the file. + + The different categories are *info*, *minor*, *major* and *critical*: + + ======== =================================================================== + Severity Description + ======== =================================================================== + critical Any issue that forces dlisio to stop its current objective + prematurely is categorised as critical. + + By default a critical error raises a RuntimeError. + + An example would be file indexing, which happens at load. Suppose + the indexing fails midway through the file. There is no way for + dlisio to reliably keep indexing the file. + However, it is likely that the file is readable up until the point + of failure. Changing the behaviour of critical from raising an + Exception to logging would in this case mean that a partially + indexed file is returned by load. + major Result of a direct specification violation in the file. dlisio + makes an assumption about what the broken information [1] should + have been and continues parsing the file on this assumption. + If no other major or critical issues are reported, it's likely that + the assumption was correct and that dlisio parsed the file + correctly. However, no guarantees can be made. + + By default a warning is logged. + + [1] Note that "information" in this case refers to the data in the + file that tells dlisio how the file should be parsed, not to the + actual parsed data. + + minor Like major issues, this is also a result of a direct specification + violation.
dlisio makes similar assumptions to keep parsing the + file. Minor issues are generally less severe and, in contrast to + major issues, are more likely to be handled correctly. + However, still no guarantees can be made about the parsed data. + + By default an info message is logged. + info The issue doesn't contradict the specification, but the situation + is peculiar. + + By default a debug message is logged. + + ======== =================================================================== + + ErrorHandler only applies to issues related to parsing information from the + file. These are issues that otherwise would force dlisio to fail, such as + direct violations of the RP66v1 specification. + It does not apply to inconsistencies and issues in the parsed data. + This means that cases where dlisio enforces behaviour of the parsed data, + such as object-to-object references, are out of scope for the ErrorHandler. + + Please also note that ErrorHandler doesn't redefine issue categories, it + only changes the default behavior. + + Attributes + ---------- + info: + Action for a merely informational message + + minor: + Action for a minor specification violation + + major: + Action for a major specification violation + + critical: + Action for a critical specification violation + + + Warnings + -------- + Escaping errors is a good solution when the user needs to read as much data + as possible, for example, to get a general overview of the file. However, + the user must be careful when using this mode during close inspection. + If the user decides to accept errors, they must be aware that some returned + data will be spoiled. Most likely it will be data which is stored in the + file near the failure. + + Warnings + -------- + Be careful not to ignore too much information when investigating files. + If you want to debug a broken part of the file, you should look at all + issues to get a full picture of the situation. + + Examples + -------- + Define your own rules: + + >>> from dlisio.errors import ErrorHandler, Actions + >>> def myhandler(msg): + ... raise RuntimeError("Custom handler: " + msg) + >>> errorhandler = ErrorHandler( + ... info = Actions.SWALLOW, + ... minor = Actions.LOG_WARNING, + ... major = Actions.RAISE, + ... critical = myhandler) + + Parse a file: + + >>> files = dlisio.load(path) + RuntimeError: "...." + >>> handler = ErrorHandler(critical=Actions.LOG_ERROR) + >>> files = dlisio.load(path, errorhandler=handler) + [ERROR] "...." + >>> for f in files: + ... pass + + """ + @staticmethod + def raise_msg(msg): + raise RuntimeError(msg) + + @staticmethod + def swallow(msg): + pass + + def __init__(self, info = Actions.LOG_DEBUG, + minor = Actions.LOG_INFO, + major = Actions.LOG_WARNING, + critical = Actions.RAISE): + + core.error_handler.__init__(self) + + self.info = info + self.minor = minor + self.major = major + self.critical = critical + + def log(self, severity, context, problem, spec, action): + """ Overrides dl::error_handler::log """ + msg = ErrorHandler.format_error( + severity, context, problem, spec, action) + + if severity == core.error_severity.info: handler = self.info + elif severity == core.error_severity.minor: handler = self.minor + elif severity == core.error_severity.major: handler = self.major + elif severity == core.error_severity.critical: handler = self.critical + else: + err='Unknown error severity.
Original msg was: {}' + raise RuntimeError(err.format(msg)) + + handler(msg) + + @staticmethod + def format_severity(severity): + if severity == core.error_severity.info: return "info" + elif severity == core.error_severity.minor: return "minor" + elif severity == core.error_severity.major: return "major" + elif severity == core.error_severity.critical: return "critical" + else: return severity + + # TODO: check if that helps for named fileheader issue + # TODO: nested format + @staticmethod + def format_error(severity, context, problem, spec, action): + format = '\n{:<{align}} {}' + severity = ErrorHandler.format_severity(severity) + + problem = format.format('Problem:', problem, align=13) + context = format.format('Where:', context, align=13) + severity = format.format('Severity:', severity, align=13) + + if spec: + spec = format.format('RP66V1 ref:', spec, align=13) + + if action: + action = format.format('Action taken:', action, align=13) + + msg = "{}{}{}{}{}" + return msg.format(problem, context, severity, spec, action) diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 3f1cfc00c..138db652d 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -325,13 +326,319 @@ dl::ident fingerprint(const std::string& type, return ref.fingerprint(); } +void assert_overflow(const char* ptr, const char* end, int skip) { + if (ptr + skip > end) { + const auto msg = "corrupted record: fmtstr would read past end"; + throw std::runtime_error(msg); + } +} + +void read_curve_sample(const char* f, const char*& ptr, const char* end, + unsigned char*& dst) +{ + /* + * Reads to dst buffer a singular curve where: + * f - current fmt character + * ptr - pointer to current position in source buffer (file record) + * end - pointer to end of the source buffer (record end) + * dst - pointer to current position in destination buffer + */ + + /* + * Supporting bounded-length identifiers in frame data is + * slightly more difficult than it immediately seem like, and + * this implementation relies on a few assumptions that may not + * hold. + * + * 1. numpy structured arrays interpret unicode on the fly + * + * On my amd64 linux: + * >>> dt = np.dtype('U5') + * >>> dt.itemsize + * 20 + * >>> np.array(['foo'], dtype = dt)[0] + * 'foo' + * >>> np.array(['foobar'], dtype = dt)[0] + * 'fooba' + * + * Meaning it supports string lengths of [0, n]. It apparently + * (and maybe rightly so) uses null termination, or the bounded + * length, which ever comes first. + * + * 2. numpy stores characters as int32 Py_UNICODE + * Numpy seems to always use uint32, and not Py_UNICODE, which + * can be both 16 and 32 bits [1]. Since it's an integer it's + * endian sensitive, and widening from char works. This is not + * really documented by numpy. + * + * 3. numpy stores no metadata with the string + * It is assumed, and seems necessary from the interface, that + * there is no in-band metadata stored about the strings when + * used in structured arrays. This means we can just write the + * unicode ourselves, and have numpy interpret it correctly. + * + * -- + * Units is just an IDENT in disguise, so it can very well take + * the same code path. + * + * [1] http://docs.h5py.org/en/stable/strings.html#what-about-numpy-s-u-type + * NumPy also has a Unicode type, a UTF-32 fixed-width + * format (4-byte characters). HDF5 has no support for wide + * characters. 
Rather than trying to hack around this and + * “pretend” to support it, h5py will raise an error when + * attempting to create datasets or attributes of this + * type. + * + */ + auto swap_pointer = [&](py::object obj) + { + PyObject* p; + std::memcpy(&p, dst, sizeof(p)); + Py_DECREF(p); + p = obj.inc_ref().ptr(); + std::memcpy(dst, &p, sizeof(p)); + dst += sizeof(p); + }; + + if (*f == DLIS_FMT_FSING1) { + float v; + float a; + ptr = dlis_fsing1(ptr, &v, &a); + auto t = py::make_tuple(v, a); + + swap_pointer(t); + return; + } + + if (*f == DLIS_FMT_FSING2) { + float v; + float a; + float b; + ptr = dlis_fsing2(ptr, &v, &a, &b); + auto t = py::make_tuple(v, a, b); + + swap_pointer(t); + return; + } + + if (*f == DLIS_FMT_FDOUB1) { + double v; + double a; + ptr = dlis_fdoub1(ptr, &v, &a); + auto t = py::make_tuple(v, a); + + swap_pointer(t); + return; + } + + if (*f == DLIS_FMT_FDOUB2) { + double v; + double a; + double b; + ptr = dlis_fdoub2(ptr, &v, &a, &b); + auto t = py::make_tuple(v, a, b); + + swap_pointer(t); + return; + } + + if (*f == DLIS_FMT_IDENT || *f == DLIS_FMT_UNITS) { + constexpr auto chars = 255; + constexpr auto ident_size = chars * sizeof(std::uint32_t); + + std::int32_t len; + char tmp[chars]; + ptr = dlis_ident(ptr, &len, tmp); + + /* + * From reading the numpy source, it looks like they put + * and interpret the unicode buffer in the array directly, + * and pad with zero. This means the string is both null + * and length terminated, whichever comes first. + */ + std::memset(dst, 0, ident_size); + for (auto i = 0; i < len; ++i) { + const auto x = std::uint32_t(tmp[i]); + std::memcpy(dst + i * sizeof(x), &x, sizeof(x)); + } + dst += ident_size; + return; + } + + if (*f == DLIS_FMT_ASCII) { + std::int32_t len; + ptr = dlis_uvari(ptr, &len); + auto ascii = py::str(ptr, len); + ptr += len; + + /* + * Numpy seems to default initalize object types even in + * the case of np.empty to None [1]. The refcount is surely + * increased, so decref it before replacing the pointer + * with a fresh str. + * + * [1] Array of uninitialized (arbitrary) data of the given + * shape, dtype, and order. Object arrays will be + * initialized to None. 
+ * https://docs.scipy.org/doc/numpy/reference/generated/numpy.empty.html + */ + swap_pointer(ascii); + return; + } + + if (*f == DLIS_FMT_OBNAME) { + std::int32_t origin; + std::uint8_t copy; + std::int32_t idlen; + char id[255]; + ptr = dlis_obname(ptr, &origin, ©, &idlen, id); + + const auto name = dl::obname { + dl::origin(origin), + dl::ushort(copy), + dl::ident(std::string(id, idlen)), + }; + + swap_pointer(py::cast(name)); + return; + } + + if (*f == DLIS_FMT_OBJREF) { + std::int32_t idlen; + char id[255]; + std::int32_t origin; + std::uint8_t copy; + std::int32_t objnamelen; + char objname[255]; + ptr = dlis_objref(ptr, + &idlen, + id, + &origin, + ©, + &objnamelen, + objname); + + const auto name = dl::objref { + dl::ident(std::string(id, idlen)), + dl::obname { + dl::origin(origin), + dl::ushort(copy), + dl::ident(std::string(objname, objnamelen)), + }, + }; + + swap_pointer(py::cast(name)); + return; + } + + if (*f == DLIS_FMT_ATTREF) { + std::int32_t id1len; + char id1[255]; + std::int32_t origin; + std::uint8_t copy; + std::int32_t objnamelen; + char objname[255]; + std::int32_t id2len; + char id2[255]; + ptr = dlis_attref(ptr, + &id1len, + id1, + &origin, + ©, + &objnamelen, + objname, + &id2len, + id2); + + const auto ref = dl::attref { + dl::ident(std::string(id1, id1len)), + dl::obname { + dl::origin(origin), + dl::ushort(copy), + dl::ident(std::string(objname, objnamelen)), + }, + dl::ident(std::string(id2, id2len)), + }; + + swap_pointer(py::cast(ref)); + return; + } + + if (*f == DLIS_FMT_DTIME) { + int Y, TZ, M, D, H, MN, S, MS; + ptr = dlis_dtime(ptr, &Y, &TZ, &M, &D, &H, &MN, &S, &MS); + Y = dlis_year(Y); + const auto US = MS * 1000; + + PyObject* p; + std::memcpy(&p, dst, sizeof(p)); + Py_DECREF(p); + p = PyDateTime_FromDateAndTime(Y, M, D, H, MN, S, US); + if (!p) throw py::error_already_set(); + std::memcpy(dst, &p, sizeof(p)); + dst += sizeof(p); + return; + } + + int src_skip, dst_skip; + const char localfmt[] = {*f, '\0'}; + dlis_packflen(localfmt, ptr, &src_skip, &dst_skip); + assert_overflow(ptr, end, src_skip); + dlis_packf(localfmt, ptr, dst); + dst += dst_skip; + ptr += src_skip; +} + +void read_fdata_frame(const char* fmt, const char*& ptr, const char* end, + unsigned char*& dst) noexcept (false) { + for (auto* f = fmt; *f; ++f) { + read_curve_sample(f, ptr, end, dst); + } +} + +void read_fdata_record(const char* pre_fmt, + const char* fmt, + const char* post_fmt, + const char* ptr, + const char* end, + unsigned char*& dst, + int& frames, + const std::size_t& itemsize, + std::size_t allocated_rows, + std::function resize) +noexcept (false) { + + /* get frame number and slots */ + while (ptr < end) { + if (frames == allocated_rows) { + resize(frames * 2); + dst += (frames * itemsize); + } + + int src_skip, dst_skip; + + dlis_packflen(pre_fmt, ptr, &src_skip, nullptr); + assert_overflow(ptr, end, src_skip); + ptr += src_skip; + + read_fdata_frame(fmt, ptr, end, dst); + + dlis_packflen(post_fmt, ptr, &src_skip, nullptr); + assert_overflow(ptr, end, src_skip); + ptr += src_skip; + + ++frames; + } +} + py::object read_fdata(const char* pre_fmt, const char* fmt, const char* post_fmt, dl::stream& file, const std::vector< long long >& indices, std::size_t itemsize, - py::object alloc) + py::object alloc, + dl::error_handler& errorhandler) noexcept (false) { // TODO: reverse fingerprint to skip bytes ahead-of-time /* @@ -382,6 +689,7 @@ noexcept (false) { dst = static_cast< unsigned char* >(info.ptr); }; + /* * The frameno is a part of the dtype, and only 
frame.channels is allowed * to call this function. If pre/post format is set, wrong data is read. @@ -392,13 +700,33 @@ noexcept (false) { assert(std::string(pre_fmt) == ""); assert(std::string(post_fmt) == ""); + auto record_dst = dst; + + const auto handle = [&]( const std::string& problem ) { + const auto context = "dl::read_fdata: reading curves"; + errorhandler.log(dl::error_severity::CRITICAL, context, problem, "", + "Record is skipped"); + // we update the buffer as we go. Hence if error happened we need to + // go back and start rewriting updated data + dst = record_dst; + }; + int frames = 0; for (auto i : indices) { + record_dst = dst; + /* get record */ - auto record = dl::extract(file, i); + dl::record record; + try { + record = dl::extract(file, i, errorhandler); + } catch (std::exception& e) { + handle(e.what()); + continue; + } if (record.isencrypted()) { - throw dl::not_implemented("encrypted FDATA record"); + handle("encrypted FDATA record"); + continue; } const auto* ptr = record.data.data(); @@ -409,275 +737,12 @@ noexcept (false) { std::uint8_t copy; ptr = dlis_obname(ptr, &origin, ©, nullptr, nullptr); - /* get frame number and slots */ - while (ptr < end) { - if (frames == allocated_rows) { - resize(frames * 2); - dst += (frames * itemsize); - } - - auto assert_overflow = [end](const char* ptr, int skip) { - if (ptr + skip > end) { - const auto msg = "corrupted record: fmtstr would read past end"; - throw std::runtime_error(msg); - } - }; - - int src_skip, dst_skip; - dlis_packflen(pre_fmt, ptr, &src_skip, nullptr); - assert_overflow(ptr, src_skip); - ptr += src_skip; - - for (auto* f = fmt; *f; ++f) { - /* - * Supporting bounded-length identifiers in frame data is - * slightly more difficult than it immediately seem like, and - * this implementation relies on a few assumptions that may not - * hold. - * - * 1. numpy structured arrays interpret unicode on the fly - * - * On my amd64 linux: - * >>> dt = np.dtype('U5') - * >>> dt.itemsize - * 20 - * >>> np.array(['foo'], dtype = dt)[0] - * 'foo' - * >>> np.array(['foobar'], dtype = dt)[0] - * 'fooba' - * - * Meaning it supports string lengths of [0, n]. It apparently - * (and maybe rightly so) uses null termination, or the bounded - * length, which ever comes first. - * - * 2. numpy stores characters as int32 Py_UNICODE - * Numpy seems to always use uint32, and not Py_UNICODE, which - * can be both 16 and 32 bits [1]. Since it's an integer it's - * endian sensitive, and widening from char works. This is not - * really documented by numpy. - * - * 3. numpy stores no metadata with the string - * It is assumed, and seems necessary from the interface, that - * there is no in-band metadata stored about the strings when - * used in structured arrays. This means we can just write the - * unicode ourselves, and have numpy interpret it correctly. - * - * -- - * Units is just an IDENT in disguise, so it can very well take - * the same code path. - * - * [1] http://docs.h5py.org/en/stable/strings.html#what-about-numpy-s-u-type - * NumPy also has a Unicode type, a UTF-32 fixed-width - * format (4-byte characters). HDF5 has no support for wide - * characters. Rather than trying to hack around this and - * “pretend” to support it, h5py will raise an error when - * attempting to create datasets or attributes of this - * type. 
- * - */ - auto swap_pointer = [&](py::object obj) - { - PyObject* p; - std::memcpy(&p, dst, sizeof(p)); - Py_DECREF(p); - p = obj.inc_ref().ptr(); - std::memcpy(dst, &p, sizeof(p)); - dst += sizeof(p); - }; - - if (*f == DLIS_FMT_FSING1) { - float v; - float a; - ptr = dlis_fsing1(ptr, &v, &a); - auto t = py::make_tuple(v, a); - - swap_pointer(t); - continue; - } - - if (*f == DLIS_FMT_FSING2) { - float v; - float a; - float b; - ptr = dlis_fsing2(ptr, &v, &a, &b); - auto t = py::make_tuple(v, a, b); - - swap_pointer(t); - continue; - } - - if (*f == DLIS_FMT_FDOUB1) { - double v; - double a; - ptr = dlis_fdoub1(ptr, &v, &a); - auto t = py::make_tuple(v, a); - - swap_pointer(t); - continue; - } - - if (*f == DLIS_FMT_FDOUB2) { - double v; - double a; - double b; - ptr = dlis_fdoub2(ptr, &v, &a, &b); - auto t = py::make_tuple(v, a, b); - - swap_pointer(t); - continue; - } - - if (*f == DLIS_FMT_IDENT || *f == DLIS_FMT_UNITS) { - constexpr auto chars = 255; - constexpr auto ident_size = chars * sizeof(std::uint32_t); - - std::int32_t len; - char tmp[chars]; - ptr = dlis_ident(ptr, &len, tmp); - - /* - * From reading the numpy source, it looks like they put - * and interpret the unicode buffer in the array directly, - * and pad with zero. This means the string is both null - * and length terminated, whichever comes first. - */ - std::memset(dst, 0, ident_size); - for (auto i = 0; i < len; ++i) { - const auto x = std::uint32_t(tmp[i]); - std::memcpy(dst + i * sizeof(x), &x, sizeof(x)); - } - dst += ident_size; - continue; - } - - if (*f == DLIS_FMT_ASCII) { - std::int32_t len; - ptr = dlis_uvari(ptr, &len); - auto ascii = py::str(ptr, len); - ptr += len; - - /* - * Numpy seems to default initalize object types even in - * the case of np.empty to None [1]. The refcount is surely - * increased, so decref it before replacing the pointer - * with a fresh str. - * - * [1] Array of uninitialized (arbitrary) data of the given - * shape, dtype, and order. Object arrays will be - * initialized to None. 
- * https://docs.scipy.org/doc/numpy/reference/generated/numpy.empty.html - */ - swap_pointer(ascii); - continue; - } - - if (*f == DLIS_FMT_OBNAME) { - std::int32_t origin; - std::uint8_t copy; - std::int32_t idlen; - char id[255]; - ptr = dlis_obname(ptr, &origin, ©, &idlen, id); - - const auto name = dl::obname { - dl::origin(origin), - dl::ushort(copy), - dl::ident(std::string(id, idlen)), - }; - - swap_pointer(py::cast(name)); - continue; - } - - if (*f == DLIS_FMT_OBJREF) { - std::int32_t idlen; - char id[255]; - std::int32_t origin; - std::uint8_t copy; - std::int32_t objnamelen; - char objname[255]; - ptr = dlis_objref(ptr, - &idlen, - id, - &origin, - ©, - &objnamelen, - objname); - - const auto name = dl::objref { - dl::ident(std::string(id, idlen)), - dl::obname { - dl::origin(origin), - dl::ushort(copy), - dl::ident(std::string(objname, objnamelen)), - }, - }; - - swap_pointer(py::cast(name)); - continue; - } - - if (*f == DLIS_FMT_ATTREF) { - std::int32_t id1len; - char id1[255]; - std::int32_t origin; - std::uint8_t copy; - std::int32_t objnamelen; - char objname[255]; - std::int32_t id2len; - char id2[255]; - ptr = dlis_attref(ptr, - &id1len, - id1, - &origin, - ©, - &objnamelen, - objname, - &id2len, - id2); - - const auto ref = dl::attref { - dl::ident(std::string(id1, id1len)), - dl::obname { - dl::origin(origin), - dl::ushort(copy), - dl::ident(std::string(objname, objnamelen)), - }, - dl::ident(std::string(id2, id2len)), - }; - - swap_pointer(py::cast(ref)); - continue; - } - - if (*f == DLIS_FMT_DTIME) { - int Y, TZ, M, D, H, MN, S, MS; - ptr = dlis_dtime(ptr, &Y, &TZ, &M, &D, &H, &MN, &S, &MS); - Y = dlis_year(Y); - const auto US = MS * 1000; - - PyObject* p; - std::memcpy(&p, dst, sizeof(p)); - Py_DECREF(p); - p = PyDateTime_FromDateAndTime(Y, M, D, H, MN, S, US); - if (!p) throw py::error_already_set(); - std::memcpy(dst, &p, sizeof(p)); - dst += sizeof(p); - continue; - } - - const char localfmt[] = {*f, '\0'}; - dlis_packflen(localfmt, ptr, &src_skip, &dst_skip); - assert_overflow(ptr, src_skip); - dlis_packf(localfmt, ptr, dst); - dst += dst_skip; - ptr += src_skip; - } - - dlis_packflen(post_fmt, ptr, &src_skip, nullptr); - assert_overflow(ptr, src_skip); - ptr += src_skip; - - ++frames; + try { + read_fdata_record(pre_fmt, fmt, post_fmt, ptr, end, dst, frames, + itemsize, allocated_rows, resize); + } catch (std::exception& e) { + handle(e.what()); + continue; } } @@ -717,6 +782,30 @@ class Pymatcher : public dl::matcher { } }; +/** trampoline helper class for dl::error_handler bindings */ +class PyErrorHandler : public dl::error_handler { +public: + /* Inherit the constructor */ + using dl::error_handler::error_handler; + + /* Trampoline (need one for each virtual function) */ + void log(const dl::error_severity& level, const std::string& context, + const std::string& problem, const std::string& specification, + const std::string& action) + const noexcept(false) override { + PYBIND11_OVERLOAD_PURE( + void, /* Return type */ + dl::error_handler, /* Parent class */ + log, /* Name of function in C++ */ + level, /* Argument(s) */ + context, + problem, + specification, + action + ); + } +}; + } PYBIND11_MAKE_OPAQUE( std::vector< dl::object_set > ) @@ -852,11 +941,13 @@ PYBIND11_MODULE(core, m) { py::class_< dl::object_attribute >( m, "object_attribute" ) .def_readonly("value", &dl::object_attribute::value) .def_readonly("units", &dl::object_attribute::units) + .def_readonly("log", &dl::object_attribute::log) ; py::class_< dl::basic_object >( m, "basic_object" ) 
.def_readonly("type", &dl::basic_object::type) .def_readonly("name", &dl::basic_object::object_name) + .def_readonly("log", &dl::basic_object::log) .def( "__len__", []( const dl::basic_object& o ) { return o.attributes.size(); }) @@ -891,11 +982,13 @@ PYBIND11_MODULE(core, m) { .def( "get", (dl::object_vector (dl::pool::*) ( const std::string&, const std::string&, - const dl::matcher& + const dl::matcher&, + const dl::error_handler& )) &dl::pool::get ) .def( "get", (dl::object_vector (dl::pool::*) ( const std::string&, - const dl::matcher& + const dl::matcher&, + const dl::error_handler& )) &dl::pool::get ) ; @@ -969,11 +1062,21 @@ PYBIND11_MODULE(core, m) { ; m.def( "extract", [](dl::stream& s, - const std::vector< long long >& tells) { + const std::vector< long long >& tells, + dl::error_handler& errorhandler) { std::vector< dl::record > recs; recs.reserve( tells.size() ); for (auto tell : tells) { - auto rec = dl::extract(s, tell); + dl::record rec; + try { + rec = dl::extract(s, tell, errorhandler); + } catch (const std::exception& e) { + const auto context = + "dl::extract: Reading raw bytes from record"; + errorhandler.log(dl::error_severity::CRITICAL, context, + e.what(), "", "Record is skipped"); + continue; + } if (rec.data.size() > 0) { recs.push_back( std::move( rec ) ); } @@ -981,11 +1084,21 @@ PYBIND11_MODULE(core, m) { return recs; }); - m.def( "parse_objects", []( const std::vector< dl::record >& recs ) { + m.def( "parse_objects", []( const std::vector< dl::record >& recs, + dl::error_handler& errorhandler ) { std::vector< dl::object_set > objects; for (const auto& rec : recs) { if (rec.isencrypted()) continue; - objects.push_back( dl::object_set( rec ) ); + + try { + objects.push_back( dl::object_set( rec ) ); + } catch (const std::exception& e) { + const auto context = + "core.parse_objects: Construct object sets"; + errorhandler.log(dl::error_severity::CRITICAL, context, + e.what(), "", "Set is skipped"); + continue; + } } return objects; }); @@ -995,15 +1108,36 @@ PYBIND11_MODULE(core, m) { m.def( "hastapemark", dl::hastapemark ); m.def("findfdata", dl::findfdata); - m.def( "findoffsets", []( dl::stream& file ) { - const auto ofs = dl::findoffsets( file ); - return py::make_tuple( ofs.explicits, ofs.implicits ); + m.def( "findoffsets", []( dl::stream& file, + dl::error_handler& errorhandler) { + const auto ofs = dl::findoffsets( file, errorhandler ); + return py::make_tuple( ofs.explicits, ofs.implicits, ofs.broken ); }); - m.def("set_encodings", set_encodings); - m.def("get_encodings", get_encodings); + py::enum_< dl::error_severity >( m, "error_severity" ) + .value( "info", dl::error_severity::INFO ) + .value( "minor", dl::error_severity::MINOR ) + .value( "major", dl::error_severity::MAJOR ) + .value( "critical", dl::error_severity::CRITICAL ) + ; + + py::class_< dl::dlis_error >( m, "dlis_error" ) + .def_readonly( "severity", &dl::dlis_error::severity ) + .def_readonly( "problem", &dl::dlis_error::problem ) + .def_readonly( "specification", &dl::dlis_error::specification ) + .def_readonly( "action", &dl::dlis_error::action ) + ; py::class_< dl::matcher, Pymatcher >( m, "matcher") .def(py::init<>()) ; + + py::class_< dl::error_handler, PyErrorHandler >( m, "error_handler") + .def(py::init<>()) + ; + + /* settings */ + m.def("set_encodings", set_encodings); + m.def("get_encodings", get_encodings); + } diff --git a/python/dlisio/file.py b/python/dlisio/file.py index 313344319..f0366ac9b 100644 --- a/python/dlisio/file.py +++ b/python/dlisio/file.py @@ -95,11 +95,12 @@ 
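Note on the error_handler binding above: because it is exposed through a pybind11 trampoline with a pure-virtual log(), the handler can be implemented directly in Python and the C++ parsing routines will call back into it; dlisio.errors.ErrorHandler builds on this. A minimal sketch, assuming only what this patch binds (the class name PrintingHandler is purely illustrative):

    from dlisio import core

    class PrintingHandler(core.error_handler):
        # Illustrative handler: print every reported issue instead of
        # formatting or filtering it.
        def log(self, severity, context, problem, specification, action):
            # severity is a core.error_severity value
            # (info, minor, major or critical)
            print(severity, context, problem, specification, action)

Such an object can in principle be passed anywhere the new error_handler parameters appear, though the higher-level dlisio.errors.ErrorHandler is the intended entry point.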
class logicalfile(object): then the users responsibility of ensuring correctness for the custom class. """ - def __init__(self, stream, object_pool, fdata_index, sul=None): + def __init__(self, stream, object_pool, fdata_index, sul, error_handler): self.file = stream self.object_pool = object_pool self.sul = sul self.fdata_index = fdata_index + self.error_handler = error_handler if 'UPDATE' in self.object_pool.types: msg = ('{} contains UPDATE-object(s) which changes other ' @@ -393,9 +394,9 @@ def find(self, objecttype, objectname=None, matcher=None): matcher = settings.regex if not objectname: - attics = self.object_pool.get(objecttype, matcher) + attics = self.object_pool.get(objecttype, matcher, self.error_handler) else: - attics = self.object_pool.get(objecttype, objectname, matcher) + attics = self.object_pool.get(objecttype, objectname, matcher, self.error_handler) objects = [] for attic in attics: @@ -451,8 +452,8 @@ def object(self, type, name, origin=None, copynr=None): if len(matches) == 0: msg = "Object not found: type={}, name={}, origin={}, copynumber={}" - if not origin: origin = "'any'" - if not copynr: copynr = "'any'" + if origin is None: origin = "'any'" + if copynr is None: copynr = "'any'" raise ValueError(msg.format(type, name, origin, copynr)) diff --git a/python/dlisio/load.py b/python/dlisio/load.py index bcab6713d..1c3d49850 100644 --- a/python/dlisio/load.py +++ b/python/dlisio/load.py @@ -1,5 +1,6 @@ from . import core from .file import physicalfile, logicalfile +from .errors import ErrorHandler def open(path): """ Open a file @@ -21,7 +22,7 @@ def open(path): """ return core.open(str(path)) -def load(path): +def load(path, error_handler = None): """ Loads a file and returns one filehandle pr logical file. The dlis standard have a concept of logical files. A logical file is a @@ -48,6 +49,11 @@ def load(path): path : str_like + error_handler : dlisio.errors.ErrorHandler, optional + Error handling rules. Default rules will apply if none supplied. + Handler will be added to all the logical files, so users may modify + the behavior at any time. 
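As a quick illustration of the new parameter, a sketch based on the behaviour exercised by the tests added in this patch (the file name below is a placeholder):

    import dlisio
    from dlisio.errors import ErrorHandler, Actions

    # Downgrade critical issues to logged errors so that load() keeps going;
    # escalate major issues to exceptions instead.
    handler = ErrorHandler(critical=Actions.LOG_ERROR, major=Actions.RAISE)

    with dlisio.load('example.dlis', error_handler=handler) as files:
        for f in files:
            pass

    # A plain callable also works; it receives the formatted message.
    def fail_hard(msg):
        raise RuntimeError(msg)

    strict = ErrorHandler(critical=fail_hard)

Since the handler is attached to every logical file, its rules can also be tightened or relaxed after load() returns, as the added tests do.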
+ Examples -------- @@ -75,6 +81,9 @@ def load(path): dlis : dlisio.physicalfile(dlisio.logicalfile) """ + if not error_handler: + error_handler = ErrorHandler() + sulsize = 80 tifsize = 12 lfs = [] @@ -117,19 +126,26 @@ def rewind(offset, tif): if tapemarks: stream = core.open_tif(stream) stream = core.open_rp66(stream) - explicits, implicits = core.findoffsets(stream) + explicits, implicits, broken = core.findoffsets(stream, error_handler) hint = rewind(stream.absolute_tell, tapemarks) - recs = core.extract(stream, explicits) - sets = core.parse_objects(recs) + recs = core.extract(stream, explicits, error_handler) + sets = core.parse_objects(recs, error_handler) pool = core.pool(sets) - fdata = core.findfdata(stream, implicits) + fdata = core.findfdata(stream, implicits, error_handler) - lf = logicalfile(stream, pool, fdata, sul) + lf = logicalfile(stream, pool, fdata, sul, error_handler) lfs.append(lf) + if len(broken): + # do not attempt to recover or read more logical files + # if the error happened in findoffsets + # return all logical files we were able to process until now + break + + stream = core.open(path) + try: - stream = core.open(path) offset = core.findvrl(stream, hint) except RuntimeError: if stream.eof(): diff --git a/python/dlisio/plumbing/basicobject.py b/python/dlisio/plumbing/basicobject.py index 5adb2a93a..4bef50b1b 100644 --- a/python/dlisio/plumbing/basicobject.py +++ b/python/dlisio/plumbing/basicobject.py @@ -196,11 +196,32 @@ def __getitem__(self, key): # No rule for parsing, keep rp66value as is, i.e. vector parse_as = vector + # report errors before checking for key presence - it might be a symptom + if len(self.attic.log) > 0: + # TODO: here and for attribute: we use fingerprint to report + # context, not repr, to have origina and copynr, but is fingerprint + # clear enough for the user? + context = "{}".format(self.fingerprint) + for error in self.attic.log: + self.logicalfile.error_handler.log( + error.severity, context, error.problem, + error.specification, error.action) + try: - rp66value = self.attic[key].value + attribute = self.attic[key] except KeyError: return defaultvalue(parse_as) + rp66value = attribute.value + + # already report errors. presence of key in the attic is enough + if len(attribute.log) > 0: + context = "{}-A.{}".format(self.fingerprint, key) + for error in attribute.log: + self.logicalfile.error_handler.log( + error.severity, context, error.problem, + error.specification, error.action) + if rp66value is None: return defaultvalue(parse_as) if rp66value == [] : return defaultvalue(parse_as) diff --git a/python/docs/dlisio.rst b/python/docs/dlisio.rst index 4deeaa397..8ed91725f 100644 --- a/python/docs/dlisio.rst +++ b/python/docs/dlisio.rst @@ -3,6 +3,11 @@ Strings and encodings .. autofunction:: dlisio.set_encodings .. autofunction:: dlisio.get_encodings +Error handling +============== +.. autoclass:: dlisio.errors.ErrorHandler() +.. autoclass:: dlisio.errors.Actions() + Open and Load ============= .. 
autofunction:: dlisio.load diff --git a/python/setup.py b/python/setup.py index 469ce75d8..a335a2201 100755 --- a/python/setup.py +++ b/python/setup.py @@ -52,7 +52,7 @@ def getversion(): description = 'DLIS v1', long_description = 'DLIS v1', url = 'https://github.com/equinor/dlisio', - packages = ['dlisio', 'dlisio.plumbing'], + packages = ['dlisio', 'dlisio.plumbing', 'dlisio.errors'], license = 'LGPL-3.0', platforms = 'any', install_requires = ['numpy'], diff --git a/python/tests/test_curves_reprcodes.py b/python/tests/test_curves_reprcodes.py index 4d4209151..534b74b2a 100644 --- a/python/tests/test_curves_reprcodes.py +++ b/python/tests/test_curves_reprcodes.py @@ -385,3 +385,9 @@ def test_ascii_broken(): def test_ascii_broken_utf8(): fpath = 'data/chap4-7/iflr/broken-utf8-ascii.dlis' _ = load_curves(fpath) + +def test_fmt_broken(): + fpath = 'data/chap4-7/iflr/broken-fmt.dlis' + with pytest.raises(RuntimeError) as exc: + _ = load_curves(fpath) + assert "fmtstr would read past end" in str(exc.value) diff --git a/python/tests/test_error_handler.py b/python/tests/test_error_handler.py new file mode 100644 index 000000000..1855489d9 --- /dev/null +++ b/python/tests/test_error_handler.py @@ -0,0 +1,269 @@ +""" +Testing error handler on various levels of representation +(user-facade functionality and underlying logical format) +""" + +import pytest +import os +import numpy as np + +import dlisio + +from dlisio.errors import ErrorHandler, Actions + +errorhandler = ErrorHandler(critical = Actions.LOG_ERROR) + +def test_custom_action(): + def custom(msg): + raise ValueError("custom message: "+msg) + + path = 'data/chap2/truncated-in-lrsh-vr-over.dlis' + errorhandler = ErrorHandler(critical=custom) + with pytest.raises(ValueError) as excinfo: + _ = dlisio.load(path, error_handler=errorhandler) + assert "custom message: \n" in str(excinfo.value) + assert "File truncated in Logical Record Header" in str(excinfo.value) + +def test_unescapable_notdlis(assert_error): + path = 'data/chap2/nondlis.txt' + with pytest.raises(RuntimeError) as excinfo: + _ = dlisio.load(path, error_handler=errorhandler) + assert "could not find visible record envelope" in str(excinfo.value) + +def test_truncated_in_data(assert_error): + path = 'data/chap2/truncated-in-second-lr.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("File truncated in Logical Record Segment") + assert len(f.channels) == 1 + +def test_tif_truncated_in_data(assert_error): + path = 'data/tif/layout/truncated-in-data.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("File truncated in Logical Record Segment") + +def test_truncated_in_lrsh(assert_error): + path = 'data/chap2/truncated-in-lrsh.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("unexpected EOF when reading record") + assert len(f.channels) == 1 + +def test_truncated_after_lrsh_new_lf(assert_error): + path = 'data/chap2/lf-truncated-after-lrsh.dlis' + with dlisio.load(path, error_handler=errorhandler) as batch: + assert_error("File truncated in Logical Record Segment") + assert len(batch) == 2 + +def test_truncated_lr_missing_lrs_vr_over(assert_error): + path = 'data/chap2/truncated-lr-no-lrs-vr-over.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("last logical record segment expects successor") + assert len(f.channels) == 0 + +def test_zeroed_before_lrs(assert_error): + path = 'data/chap2/zeroed-in-1st-lr.dlis' + with dlisio.load(path, 
error_handler=errorhandler) as (f, *_): + assert_error("Too short logical record") + assert len(f.channels) == 0 + +def test_zeroed_before_vr(assert_error): + path = 'data/chap2/zeroed-in-2nd-lr.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("Incorrect format version") + assert len(f.channels) == 1 + +def test_extract_broken_padbytes(assert_error): + path = 'data/chap2/padbytes-bad.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("bad segment trim") + valid_obj = f.object("VALID-SET", "VALID-OBJ", 10, 0) + assert valid_obj + +def test_findfdata_bad_obname(assert_error): + path = 'data/chap3/implicit/fdata-broken-obname.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("fdata record corrupted, error on reading obname") + assert "T.FRAME-I.DLIS-FRAME-O.3-C.1" in f.fdata_index + +def test_curves_broken_fmt(assert_error): + path = 'data/chap4-7/iflr/broken-fmt.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + frame = f.object('FRAME', 'FRAME-REPRCODE', 10, 0) + curves = frame.curves() + assert_error("fmtstr would read past end") + assert np.array_equal(curves['FRAMENO'], np.array([1, 3])) + +def test_parse_objects_unexpected_attribute_in_set(assert_error): + path = 'data/chap3/explicit/broken-in-set.dlis' + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + assert_error("Construct object sets") + _ = f.object('VALID-SET', 'VALID-OBJ', 10, 0) + _ = f.object('GOOOD-SET', 'VALID-OBJ', 10, 0) + +def test_parse_critical_escaped(tmpdir, merge_files_oneLR, + assert_error): + path = os.path.join(str(tmpdir), 'error-on-attribute-access.dlis') + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/empty.dlis.part', + ] + merge_files_oneLR(path, content) + + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + # value is unclear and shouldn't be trusted + _ = obj['INVALID'] + assert_error("invalid representation code") + + +def test_parse_unparsable_record(tmpdir, merge_files_oneLR, assert_error): + path = os.path.join(str(tmpdir), 'unparsable.dlis') + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/empty.dlis.part', + 'data/chap3/object/object2.dlis.part', + # here must go anything that will be considered unrecoverable + 'data/chap3/objattr/reprcode-invalid-value.dlis.part' + ] + merge_files_oneLR(path, content) + + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + assert_error("Action taken: object set parse has been interrupted") + + # value is unclear and shouldn't be trusted + _ = obj['INVALID'] + assert_error("invalid representation code") + + with pytest.raises(ValueError) as excinfo: + _ = f.object('VERY_MUCH_TESTY_SET', 'OBJECT2', 1, 1) + assert "not found" in str(excinfo.value) + +def test_parse_major_errored(tmpdir, merge_files_oneLR): + path = os.path.join(str(tmpdir), 'replacement-set.dlis') + content = [ + 'data/chap3/sul.dlis.part', + 'data/chap3/set/replacement.dlis.part', + 'data/chap3/template/default.dlis.part', + 'data/chap3/object/object.dlis.part' + ] + merge_files_oneLR(path, content) + + errorhandler = ErrorHandler( + major=Actions.RAISE) + with dlisio.load(path, 
error_handler=errorhandler) as (f, *_): + with pytest.raises(RuntimeError) as excinfo: + _ = f.object('REPLACEMENT', 'OBJECT', 1, 1) + assert "Replacement sets are not supported" in str(excinfo.value) + +def test_parse_minor_errored(tmpdir, merge_files_oneLR): + path = os.path.join(str(tmpdir), 'redundant-set.dlis') + content = [ + 'data/chap3/sul.dlis.part', + 'data/chap3/set/redundant.dlis.part', + 'data/chap3/template/default.dlis.part', + 'data/chap3/object/object.dlis.part' + ] + merge_files_oneLR(path, content) + + errorhandler = ErrorHandler( + major=Actions.RAISE, + minor=Actions.RAISE + ) + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + with pytest.raises(RuntimeError) as excinfo: + _ = f.object('REDUNDANT', 'OBJECT', 1, 1) + assert "Redundant sets are not supported" in str(excinfo.value) + +def test_many_logical_files(): + path = "data/chap4-7/many-logical-files-error-in-last.dlis" + errorhandler = ErrorHandler() + errorhandler.critical = Actions.LOG_ERROR + + with dlisio.load(path, error_handler=errorhandler) as files: + # last file is not processed + assert len(files) == 2 + + errorhandler.critical = Actions.RAISE + errorhandler.major = Actions.RAISE + errorhandler.minor = Actions.RAISE + for f in files: + with pytest.raises(RuntimeError): + f.load() + + # define default error handler specially for the first logical file + errorhandler = ErrorHandler() + files[0].error_handler = errorhandler + + files[0].load() + with pytest.raises(RuntimeError): + files[1].load() + + +@pytest.fixture +def create_very_broken_file(tmpdir, merge_files_oneLR, merge_files_manyLR): + valid = os.path.join(str(tmpdir), 'valid.dlis') + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/object/object.dlis.part', + # will cause issues on attribute access + 'data/chap3/objattr/empty.dlis.part', + 'data/chap3/object/object2.dlis.part', + # will cause issues on parsing + 'data/chap3/objattr/reprcode-invalid-value.dlis.part' + ] + merge_files_oneLR(valid, content) + + content = [ + valid, + 'data/chap3/sul.dlis.part', # will cause issues on load + ] + + def create_file(path): + merge_files_manyLR(path, content) + return create_file + + +def test_complex(create_very_broken_file, tmpdir): + + path = os.path.join(str(tmpdir), 'complex.dlis') + create_very_broken_file(path) + + errorhandler = ErrorHandler() + + with pytest.raises(RuntimeError): + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + pass + + # escape errors on load + errorhandler.critical = Actions.LOG_ERROR + with dlisio.load(path, error_handler=errorhandler) as (f, *_): + + # fail again on parsing objects not parsed on load + errorhandler.critical = Actions.RAISE + with pytest.raises(RuntimeError) as excinfo: + _ = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + assert "object set" in str(excinfo.value) + + # escape errors on parsing + errorhandler.critical = Actions.LOG_ERROR + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + + errorhandler.critical = Actions.RAISE + # set is parsed, but user should get error anyway + with pytest.raises(RuntimeError) as excinfo: + _ = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + assert "object set" in str(excinfo.value) + + # fail on attribute access + errorhandler.critical = Actions.RAISE + with pytest.raises(RuntimeError): + _ = obj['INVALID'] + + # retrieve whatever value from errored attribute + errorhandler.critical = Actions.LOG_ERROR + _ = obj['INVALID'] diff --git 
a/python/tests/test_helpers.py b/python/tests/test_helpers.py index 9b3c9a87b..4f9c97e44 100644 --- a/python/tests/test_helpers.py +++ b/python/tests/test_helpers.py @@ -56,7 +56,7 @@ def test_record_attributes(): stream = core.open('data/chap2/3lr-in-vr-one-encrypted.dlis', zero=80) stream = core.open_rp66(stream) explicits = [0, 32, 64] - recs = core.extract(stream, explicits) + recs = core.extract(stream, explicits, dlisio.errors.ErrorHandler()) buffer = bytearray(1) diff --git a/python/tests/test_loading.py b/python/tests/test_loading.py index e54acdcfb..35cf71a2c 100644 --- a/python/tests/test_loading.py +++ b/python/tests/test_loading.py @@ -33,14 +33,24 @@ def test_filehandles_closed(tmpdir): many_logical = str(tmpdir.join('many_logical')) shutil.copyfile('data/chap4-7/many-logical-files.dlis', many_logical) + offsets_error_escape = str(tmpdir.join('error_escape_zeroed')) + shutil.copyfile('data/chap2/zeroed-in-1st-lr.dlis', offsets_error_escape) + with dlisio.load(tmp) as _: pass with dlisio.load(many_logical) as fls: assert len(fls) == 3 + # error happens in 1st LF, but is escaped + errorhandler = dlisio.errors.ErrorHandler( + critical=dlisio.errors.Actions.LOG_ERROR) + with dlisio.load(offsets_error_escape, error_handler=errorhandler): + pass + os.remove(tmp) os.remove(many_logical) + os.remove(offsets_error_escape) def test_filehandles_closed_when_load_fails(tmpdir): # Check that we don't leak open filehandles on failure diff --git a/python/tests/test_logical_file.py b/python/tests/test_logical_file.py index 842e96662..b0751ca6c 100644 --- a/python/tests/test_logical_file.py +++ b/python/tests/test_logical_file.py @@ -28,7 +28,8 @@ def test_object_unknown(f): def test_object_nonexisting(f): with pytest.raises(ValueError) as exc: _ = f.object("UNKNOWN_TYPE", "SOME_OBJECT", 0, 0) - assert "Object not found" in str(exc.value) + assert ("Object not found: type=UNKNOWN_TYPE, name=SOME_OBJECT, " + "origin=0, copynumber=0") in str(exc.value) with pytest.raises(ValueError): _ = f.object("CHANNEL", "CHANN1", 11, 0) @@ -61,6 +62,8 @@ def test_object_many_objects_nameonly(tmpdir_factory, merge_files_manyLR): with pytest.raises(ValueError) as exc: _ = f.object("CHANNEL", "MATCH1") assert "Candidates are" in str(exc.value) + assert "origin=16, copy=0" in str(exc.value) + assert "origin=127, copy=0" in str(exc.value) def test_object_ducplicated_object(tmpdir_factory, merge_files_manyLR): # Spec violation: Two objects with the same signature AND identical content diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index a9355ca3d..a58a5e248 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -63,8 +63,7 @@ def test_invariant_attribute(tmpdir, merge_files_oneLR): assert attr.value == [False, False, True] -@pytest.mark.future_warning_invariant_attribute -def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR): +def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'invariant-attribute-in-object.dlis') content = [ 'data/chap3/start.dlis.part', @@ -79,6 +78,9 @@ def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR): attr = obj.attic['DEFAULT_ATTRIBUTE'] assert attr.value == [8.0] + _ = obj['DEFAULT_ATTRIBUTE'] + assert_log("Invariant attribute") + def test_absent_attribute(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'attribute_absent.dlis') @@ -98,8 +100,7 @@ def test_absent_attribute(tmpdir, merge_files_oneLR): _ = 
obj.attic['DEFAULT_ATTRIBUTE'] -@pytest.mark.future_warning_absent_attr_in_template -def test_absent_attribute_in_template(tmpdir, merge_files_oneLR): +def test_absent_attribute_in_template(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'absent-attribute-in-template.dlis') content = [ 'data/chap3/start.dlis.part', @@ -112,6 +113,7 @@ def test_absent_attribute_in_template(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *tail): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'].value + assert_log("Absent Attribute in object set template") @pytest.mark.future_test_attributes def test_global_default_attribute(tmpdir, merge_files_oneLR): @@ -255,7 +257,15 @@ def test_repcode_invalid_datetime(tmpdir, merge_files_oneLR): assert "month must be" in str(excinfo.value) def test_repcode_invalid_in_template_value(tmpdir, merge_files_oneLR): - path = os.path.join(str(tmpdir), 'invalid-repcode.dlis') + path = os.path.join(str(tmpdir), 'invalid-repcode-template-value.dlis') + + # template for attribute 'INVALID': + # repcode = 0 # invalid + # value = [bytes] # value is declared to be present + + # attribute 'INVALID' in obj 'OBJECT': + # repcode = USHORT # valid + # value = [1, 2, 3, 4] # valid value content = [ 'data/chap3/start.dlis.part', 'data/chap3/template/invalid-repcode-value.dlis.part', @@ -270,11 +280,21 @@ def test_repcode_invalid_in_template_value(tmpdir, merge_files_oneLR): assert "unknown representation code" in str(excinfo.value) -def test_repcode_invalid_in_template_no_value(tmpdir, merge_files_oneLR): - path = os.path.join(str(tmpdir), 'invalid-repcode-template.dlis') +def test_repcode_invalid_in_template_no_value_fixed(tmpdir, merge_files_oneLR, + assert_info): + path = os.path.join(str(tmpdir), 'invalid-repcode-template-fixed.dlis') + + # template for attribute 'INVALID': + # repcode = 0 # invalid + # value = None # no value + + # attribute 'INVALID' in obj 'OBJECT': + # repcode = USHORT # valid + # value = [1, 2, 3, 4] # valid value content = [ 'data/chap3/start.dlis.part', 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/template/default.dlis.part', 'data/chap3/object/object.dlis.part', 'data/chap3/objattr/all-set.dlis.part', ] @@ -282,17 +302,101 @@ def test_repcode_invalid_in_template_no_value(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *_): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + + _ = obj['DEFAULT_ATTRIBUTE'] + assert_info("Problem: One or more attributes " + "of this object violate specification") + assert_info("Where: T.VERY_MUCH_TESTY_SET-I.OBJECT-O.1-C.1") + attr = obj.attic['INVALID'] assert attr.value == [1, 2, 3, 4] + _ = obj['INVALID'] + assert_info("Problem: Invalid representation code 0\n" + "Where: " + "T.VERY_MUCH_TESTY_SET-I.OBJECT-O.1-C.1-A.INVALID") + +def test_repcode_invalid_in_template_no_value_not_fixed(tmpdir, + merge_files_oneLR, + assert_info): + path = os.path.join(str(tmpdir), 'invalid-repcode-template-bad.dlis') + + + # template for attribute 'INVALID': + # repcode = 0 # invalid + # value = None # no value + + # attribute 'INVALID' in obj 'OBJECT' is implicitly taken from template + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/object/object.dlis.part', + ] + merge_files_oneLR(path, content) + with dlisio.load(path) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + attr = obj.attic['INVALID'] + # note that behavior is different from the 
one below + # here we never process the attribute, hence error is not triggered + assert attr.value == None + + _ = obj['INVALID'] + assert_info("Invalid representation code 0") + +def test_repcode_invalid_in_template_no_value_empty(tmpdir, merge_files_oneLR, + assert_log, assert_info): + path = os.path.join(str(tmpdir), 'invalid-repcode-template-bad-empty.dlis') -def test_repcode_invalid_in_objects(tmpdir, merge_files_oneLR): + # template for attribute 'INVALID': + # repcode = 0 # invalid + # value = None # no value + + # attribute 'INVALID' in obj 'OBJECT' is explicitly taken from template + + # attribute 'INVALID' in obj 'OBJECT2': + # repcode = USHORT # valid + # value = [1, 2, 3, 4] # valid value + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/invalid-repcode-no-value.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/empty.dlis.part', + 'data/chap3/object/object2.dlis.part', + 'data/chap3/objattr/all-set.dlis.part', + ] + merge_files_oneLR(path, content) + with dlisio.load(path) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + with pytest.raises(RuntimeError) as excinfo: + # note that behavior is different from the one above + # by adding "empty", we process the attribute, hence trigger error + _ = obj['INVALID'] + assert "invalid representation code" in str(excinfo.value) + + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT2', 1, 1) + attr = obj.attic['INVALID'] + assert attr.value == [1, 2, 3, 4] + + _ = obj['INVALID'] + assert_info( + "Problem: Invalid representation code 0\n" + "Where: T.VERY_MUCH_TESTY_SET-I.OBJECT2-O.1-C.1-A.INVALID") + +def test_repcode_invalid_in_objects_value(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'invalid-repcode-object.dlis') + + # template for attribute 'DEFAULT_ATTRIBUTE': + # repcode = FDOUBLE # valid + # value = [value1, value2] # valid + + # attribute 'DEFAULT_ATTRIBUTE' in obj 'OBJECT': + # repcode = 32 # invalid + # value = [bytes] # value is declared to be present content = [ 'data/chap3/start.dlis.part', 'data/chap3/template/default.dlis.part', 'data/chap3/object/object.dlis.part', - 'data/chap3/objattr/reprcode-invalid.dlis.part', + 'data/chap3/objattr/reprcode-invalid-value.dlis.part', ] merge_files_oneLR(path, content) @@ -301,8 +405,41 @@ def test_repcode_invalid_in_objects(tmpdir, merge_files_oneLR): f.load() assert "unknown representation code" in str(excinfo.value) -@pytest.mark.future_warning_repcode_different_no_value -def test_repcode_different_no_value(tmpdir, merge_files_oneLR): +def test_repcode_invalid_in_objects_no_value(tmpdir, merge_files_oneLR, + assert_log): + path = os.path.join(str(tmpdir), 'invalid-repcode-object-no-value.dlis') + + # template for attribute 'GLOBAL_DEFAULT_ATTRIBUTE' is empty + + # attribute 'GLOBAL_DEFAULT_ATTRIBUTE' in obj 'OBJECT': + # repcode = 32 # invalid + # value = None # no value is present + + # attribute 'GLOBAL_DEFAULT_ATTRIBUTE' in obj 'OBJECT2': + # repcode = USHORT # valid + # value = [1, 2, 3, 4] # valid value + content = [ + 'data/chap3/start.dlis.part', + 'data/chap3/template/global-default.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/reprcode-invalid-no-value.dlis.part', + 'data/chap3/object/object2.dlis.part', + 'data/chap3/objattr/all-set.dlis.part', + ] + merge_files_oneLR(path, content) + with dlisio.load(path) as (f, *_): + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + with pytest.raises(RuntimeError) as excinfo: + _ = obj['GLOBAL_DEFAULT_ATTRIBUTE'] + assert 
"invalid representation code" in str(excinfo.value) + assert_log("value is not explicitly set") + + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT2', 1, 1) + attr = obj.attic['GLOBAL_DEFAULT_ATTRIBUTE'] + assert attr.value == [1, 2, 3, 4] + +def test_repcode_different_no_value(tmpdir, merge_files_oneLR, assert_log, + assert_info): path = os.path.join(str(tmpdir), 'different-repcode-no-value.dlis') content = [ 'data/chap3/start.dlis.part', @@ -314,7 +451,10 @@ def test_repcode_different_no_value(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *_): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) - assert obj.attic['DEFAULT_ATTRIBUTE'].value == [0j, 0j] + + attr = obj['DEFAULT_ATTRIBUTE'] + assert attr == [0j, 0j] + assert_log("value is not explicitly set") @pytest.mark.future_test_attributes def test_count0_novalue(tmpdir, merge_files_oneLR): @@ -371,8 +511,7 @@ def test_count0_different_repcode(tmpdir, merge_files_oneLR): assert attr.value == None -@pytest.mark.future_warning_label_bit_set_in_object_attr -def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR): +def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'label_bit_set_in_attribute.dlis') content = [ 'data/chap3/start.dlis.part', @@ -386,10 +525,11 @@ def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'].value + _ = obj['DEFAULT_ATTRIBUTE'] + assert_log("Label bit") -@pytest.mark.future_warning_label_bit_not_set_in_template @pytest.mark.not_implemented_datetime_timezone -def test_label_bit_not_set_in_template(tmpdir, merge_files_oneLR): +def test_label_bit_not_set_in_template(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'label-bit-not-set-in-template.dlis') content = [ 'data/chap3/start.dlis.part', @@ -403,10 +543,10 @@ def test_label_bit_not_set_in_template(tmpdir, merge_files_oneLR): attr = obj.attic['NEW_ATTRIBUTE'] dt = datetime(2033, 4, 19, 20, 39, 58, 103000) assert attr.value == [dt] + assert_log("Label not set in template") -@pytest.mark.future_warning_set_type_bit_not_set -def test_set_type_bit_not_set_in_set(tmpdir, merge_files_oneLR): +def test_set_type_bit_not_set_in_set(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'set-type-not-set.dlis') content = [ 'data/chap3/sul.dlis.part', @@ -419,10 +559,13 @@ def test_set_type_bit_not_set_in_set(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *tail): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'].value + assert_log("Problem: SET:type not set\n" + "Where: object set of type 'VERY_MUCH_TESTY_SET' " + "named ''") -@pytest.mark.future_warning_object_name_bit_not_set -def test_object_name_bit_not_set_in_object(tmpdir, merge_files_oneLR): +def test_object_name_bit_not_set_in_object(tmpdir, merge_files_oneLR, + assert_log): path = os.path.join(str(tmpdir), 'no-object-name-bit.dlis') content = [ 'data/chap3/start.dlis.part', @@ -434,10 +577,13 @@ def test_object_name_bit_not_set_in_object(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *tail): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'].value + _ = obj['DEFAULT_ATTRIBUTE'] + assert_log("Problem: OBJECT:name was not set\n" + "Where: T.VERY_MUCH_TESTY_SET-I.OBJECT-O.1-C.1") @pytest.mark.future_test_attributes -def test_novalue_less_count(tmpdir, merge_files_oneLR): +def 
test_novalue_less_count(tmpdir, merge_files_oneLR, assert_log): path = os.path.join(str(tmpdir), 'novalue-less-count.dlis') content = [ 'data/chap3/start.dlis.part', @@ -455,21 +601,61 @@ def test_novalue_less_count(tmpdir, merge_files_oneLR): assert attr.units == 'default attr units' assert attr.value == [-0.75] + _ = obj['DEFAULT_ATTRIBUTE'] + assert_log("< template count") + -@pytest.mark.not_implemented def test_novalue_more_count(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'novalue-more-count.dlis') content = [ 'data/chap3/start.dlis.part', 'data/chap3/template/default.dlis.part', 'data/chap3/object/object.dlis.part', - 'data/chap3/objattr/count9-novalue.dlis.part' + 'data/chap3/objattr/count9-novalue.dlis.part', + 'data/chap3/object/object2.dlis.part', + 'data/chap3/objattr/all-set.dlis.part' ] merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(NotImplementedError): - f.load() + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + with pytest.raises(RuntimeError) as excinfo: + _ = obj['DEFAULT_ATTRIBUTE'] + assert "> template count" in str(excinfo.value) + + obj2 = f.object('VERY_MUCH_TESTY_SET', 'OBJECT2', 1, 1) + attr = obj2['DEFAULT_ATTRIBUTE'] + assert attr == [1, 2, 3, 4] + +def test_set_redundat(tmpdir, merge_files_oneLR, assert_info): + path = os.path.join(str(tmpdir), 'redundant-set.dlis') + content = [ + 'data/chap3/sul.dlis.part', + 'data/chap3/set/redundant.dlis.part', + 'data/chap3/template/default.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/all-set.dlis.part' + ] + merge_files_oneLR(path, content) + + with dlisio.load(path) as (f, *tail): + _ = f.object('REDUNDANT', 'OBJECT', 1, 1) + assert_info("Redundant sets are not supported") + +def test_set_replacement(tmpdir, merge_files_oneLR, assert_log): + path = os.path.join(str(tmpdir), 'replacement-set.dlis') + content = [ + 'data/chap3/sul.dlis.part', + 'data/chap3/set/replacement.dlis.part', + 'data/chap3/template/default.dlis.part', + 'data/chap3/object/object.dlis.part', + 'data/chap3/objattr/all-set.dlis.part' + ] + merge_files_oneLR(path, content) + + with dlisio.load(path) as (f, *tail): + _ = f.object('REPLACEMENT', 'OBJECT', 1, 1) + assert_log("Replacement sets are not supported") # these tests first of all verify findfdata method, which now belongs @@ -539,7 +725,8 @@ def test_findfdata_bad_obname(): # Obname is truncated by LRS-length, which in itself is fine as long as it # continues on the next LRS. However, in this case there are no new LRS. - assert 'File corrupted. 
Error on reading fdata obname' in str(excinfo.value) + msg = 'fdata record corrupted, error on reading obname' + assert msg in str(excinfo.value) def test_unexpected_attribute_in_set(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'unexpected-attribute.dlis') @@ -551,7 +738,7 @@ def test_unexpected_attribute_in_set(tmpdir, merge_files_oneLR): ] merge_files_oneLR(path, content) - with pytest.raises(ValueError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: dlisio.load(path) assert "expected SET" in str(excinfo.value) @@ -568,7 +755,7 @@ def test_unexpected_set_in_object(tmpdir, merge_files_oneLR): merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(ValueError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: f.load() assert "expected ATTRIB" in str(excinfo.value) @@ -585,7 +772,7 @@ def test_unexpected_set_in_template(tmpdir, merge_files_oneLR): merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(ValueError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: f.load() assert "expected ATTRIB" in str(excinfo.value) @@ -602,7 +789,7 @@ def test_unexpected_attribute_instead_of_object(tmpdir, merge_files_oneLR): merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(ValueError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: f.load() assert "expected OBJECT" in str(excinfo.value) @@ -617,12 +804,13 @@ def test_cut_before_template(tmpdir, merge_files_oneLR): merge_files_oneLR(path, content) with dlisio.load(path) as (f, *_): - with pytest.raises(IndexError) as excinfo: + with pytest.raises(RuntimeError) as excinfo: f.load() assert "unexpected end-of-record" in str(excinfo.value) -def test_cut_before_object(tmpdir, merge_files_oneLR): +def test_cut_before_object(tmpdir, merge_files_oneLR, assert_debug): + # otherwise known as "set with no objects" test path = os.path.join(str(tmpdir), 'cut-before-object.dlis') content = [ 'data/chap3/start.dlis.part', @@ -632,6 +820,7 @@ def test_cut_before_object(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f,): objects = f.find('.*', '.*') assert len(objects) == 0 + assert_debug("Set contains no objects") @pytest.mark.skip(reason="result inconsistent") diff --git a/python/tests/test_physical_layout.py b/python/tests/test_physical_layout.py index c48a5eb9e..4d2065754 100644 --- a/python/tests/test_physical_layout.py +++ b/python/tests/test_physical_layout.py @@ -42,11 +42,18 @@ def test_lrs_atributes_inconsistency(): _ = dlisio.load('data/chap2/attrs-inconsistency-type-pred.dlis') assert "inconsistency" in str(excinfo.value) -def test_padbytes_as_large_as_record(): +def test_logical_file_lrs_inconsistency(): + with pytest.raises(RuntimeError) as excinfo: + _ = dlisio.load('data/chap2/lf-lrs-inconsistency.dlis') + msg = "logical file, but last logical record segment expects successor" + assert msg in str(excinfo.value) + +def test_padbytes_as_large_as_record(assert_info): path = 'data/chap2/padbytes-large-as-record.dlis' with dlisio.load(path) as (f,): assert len(f.find('.*', '.*')) == 0 assert len(f.fdata_index) == 0 + assert_info("= logical record segment length") def test_padbytes_as_large_as_segment_explicit(): path = 'data/chap2/padbytes-large-as-seg-explict.dlis' @@ -91,21 +98,42 @@ def test_broken_vr(): _ = dlisio.load('data/chap2/incomplete-vr.dlis') assert "file may be corrupted" in str(excinfo.value) +def test_truncated_in_iflr(): + with pytest.raises(RuntimeError) as excinfo: + _ = 
dlisio.load('data/chap2/truncated-in-iflr.dlis') + assert "File truncated in Logical Record Segment" in str(excinfo.value) + def test_truncated_in_second_lr(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-in-second-lr.dlis') - assert "unexpected EOF when reading record" in str(excinfo.value) + assert "File truncated in Logical Record Segment" in str(excinfo.value) def test_truncated_in_lrsh(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-in-lrsh.dlis') assert "unexpected EOF when reading record" in str(excinfo.value) -def test_truncated_on_lrs(): +def test_truncated_in_lrsh_vr_over(): with pytest.raises(RuntimeError) as excinfo: - _ = dlisio.load('data/chap2/truncated-on-lrs.dlis') + _ = dlisio.load('data/chap2/truncated-in-lrsh-vr-over.dlis') + assert "File truncated in Logical Record Header" in str(excinfo.value) + +def test_truncated_after_lrsh(): + with pytest.raises(RuntimeError) as excinfo: + _ = dlisio.load('data/chap2/truncated-after-lrsh.dlis') + assert "File truncated in Logical Record Segment" in str(excinfo.value) + +def test_truncated_lr_missing_lrs_in_vr(): + with pytest.raises(RuntimeError) as excinfo: + _ = dlisio.load('data/chap2/truncated-lr-no-lrs-in-vr.dlis') assert "unexpected EOF when reading record" in str(excinfo.value) +def test_truncated_lr_missing_lrs_vr_over(): + with pytest.raises(RuntimeError) as excinfo: + _ = dlisio.load('data/chap2/truncated-lr-no-lrs-vr-over.dlis') + msg = "last logical record segment expects successor" + assert msg in str(excinfo.value) + def test_truncated_on_full_lr(): with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load('data/chap2/truncated-on-full-lr.dlis') diff --git a/python/tests/test_tif.py b/python/tests/test_tif.py index 384e81fda..899905028 100644 --- a/python/tests/test_tif.py +++ b/python/tests/test_tif.py @@ -96,7 +96,7 @@ def test_layout_truncated_in_data(): fpath = 'data/tif/layout/truncated-in-data.dlis' with pytest.raises(RuntimeError) as excinfo: _ = dlisio.load(fpath) - assert "unexpected EOF" in str(excinfo.value) + assert "File truncated in Logical Record Segment" in str(excinfo.value) def test_layout_truncated_in_header(): fpath = 'data/tif/layout/truncated-in-header.dlis'