diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index d65a5b648..2f32b4c5c 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -65,8 +66,9 @@ class stream { struct stream_offsets { - std::vector< long long > explicits; - std::vector< long long > implicits; + std::vector< index_entry > explicits; + std::vector< index_entry > implicits; + std::vector< index_entry > encrypted; }; stream open(const std::string&, std::int64_t) noexcept (false); @@ -80,10 +82,10 @@ bool hastapemark(stream&) noexcept (false); record extract(stream&, long long) noexcept (false); record& extract(stream&, long long, long long, record&) noexcept (false); -stream_offsets findoffsets(dl::stream&) noexcept (false); +dl::stream_offsets findoffsets(dl::stream&) noexcept (false); -std::vector< std::pair< std::string, long long > > -findfdata(dl::stream&, const std::vector< long long >&) noexcept (false); +std::map< std::string, std::vector< long long > > +findfdata(dl::stream&, const std::vector< index_entry >&) noexcept (false); } diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index e32db3960..9c8b3a630 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -68,6 +69,61 @@ enum class representation_code : std::uint8_t { undef = DLIS_UNDEF, }; +/* Parsing info + * + * These enum values are intended to be set _on_ the parsed objects created by + * dlisio's parsing routines. The values indicate how the object was parsed and + * potential inconsistencies, _recoverable_ errors or other relevant information found by + * the parsing routine. + * + * The enum values can be implicitly converted into std::error_code, which + * preserve the original value and offers a human readable description of the + * error. + * + * Each value is associated with a severity degree. + * + * Example: + * + * std::error_code code = dl::parsing_info:invar; + * if ( code == dl::parsing_severity::debug ) + * std::cout << err.message(); + * + * Output: + * "attr.invariant != 0" + * + */ +enum class parsing_info { + /* info */ absent_value = 1, // attr.count == 0 means absent value + /* warn */ dlisio_default = 2, // Value defaulted by dlisio + /* debug */ attr_invar = 3, // attr.invariant != 0 + /* debug */ attr_label = 4, // attr.label != 0 + /* debug */ noval_count = 5, // attr.count > 0, !attr.value + /* debug */ reprc_ne = 6, // attr.reprc != tmpl.reprc + /* debug */ count_eq = 7, // attr.count == tmpl.count + /* debug */ count_lt = 8, // attr.count < tmpl.count + /* debug */ count_gt = 9, // attr.count > tmpl.count + /* debug */ nodefault = 10, // no tmpl.value + /* warn */ reprc_invalid = 11, // unknown attr.reprc + /* info */ invariant = 12, // Invariant attribute +}; + +/* Construct an std::error_code object from dl::parsing_info. std::error_code's + * constructor uses augment-dependent lookup [1] to find and execute this + * function. This makes it possible for us to tell the constructor how to + * cast from dl::parsing_info. + * + * [1] https://en.wikipedia.org/wiki/Argument-dependent_name_lookup + */ +std::error_code make_error_code(parsing_info); + +enum class parsing_severity { + info = 1, // Confirmation that things are working as expected + debug = 2, // Information that may be interesting when debugging + warning = 3, // Information that may be interesting to anyone +}; + +std::error_condition make_error_condition(parsing_severity); + /* * It's _very_ often necessary to access the raw underlying type of the strong * type aliases for comparisons, literals, or conversions. dl::decay inspects @@ -392,6 +448,24 @@ using value_vector = mpark::variant< /* * The structure of an attribute as described in 3.2.2.1 + * + * Error handling: + * + * Due to the intricate structure of a dlis-file, dlisio typically over-parse + * when a certain pieces of information is queried. This would typically makes + * any warnings or errors issued from the parsing routines pre-mature and might + * result in the query failing due to an error in some (from a + * user-perspective) unrelated data. + * + * To combat this, the result of parsing routines (and the state of the + * parsed object) is communicated through error_codes set on the object. + * + * It is the consumers responsibility of checking the state of the + * object before using it's content. + * + * object_attribute.errcodes contains a list of enumerated values from + * dl::parsing_info. These can be implicitly translated into std::error_code + * objects which offers human readable descriptions of the error(s). */ struct object_attribute { dl::ident label = {}; @@ -400,7 +474,8 @@ struct object_attribute { representation_code reprc = representation_code::ident; dl::units units = {}; dl::value_vector value = {}; - bool invariant = false; + + std::vector< dl::parsing_info > info; bool operator == (const object_attribute& ) const noexcept (true); }; @@ -459,24 +534,91 @@ struct basic_object { bool operator != (const basic_object&) const noexcept (true); dl::obname object_name; + dl::ident type; std::vector< object_attribute > attributes; }; -/* - * The object set, after parsing, is an (unordered?) collection of objects. In - * parsing, more information is added through creating custom types, but the - * fundamental restriction is one type per set. +/* Object set + * + * The object SET, as defined by rp66v1 chapter 3.2.1 is a series of objects - + * all derived from the same object template, all of the same type. + * + * Because the pieces of information making up the objects are mostly variable + * size there is random access to get specific objects. For the same reason + * there is no way of making an index of the objects without parsing the full + * set. Hence the entire set need to be parsed and cached in one go. + * + * However, parsing a lot of sets are expensive and often unnecessary. Too + * avoid the upfront cost of parsing, object_set is a self parsing type. I.e. + * it is initialized with a buffer of raw bytes making up the set - which is + * comparably much cheaper to extract than the actual parsing. The parsing is + * considered an implementation detail of the class and will be postponed until + * the first outside query for objects. + * + * Typical object queries will revolve around the type of object - hence + * parsing the set type (and name) independently of the rest of the set makes + * sense. * - * The variant-of-vectors is wells suited for this + * Caching the raw bytes on the object also makes it independent of IO. + * + * Encrypted Records: + * + * encrypted records cannot be parsed by dlisio without being decrypted first. + * As object_set does its parsing itself, it _will_ fail on construction if + * given an encrypted record. */ using object_vector = std::vector< basic_object >; struct object_set { +public: + explicit object_set( std::vector< char > b ) noexcept (false); + int role; // TODO: enum class? dl::ident type; dl::ident name; + + void parse() noexcept (false); + bool isparsed() const noexcept (true); + + dl::basic_object& at(const dl::obname&) noexcept (false); + dl::object_vector& objects() noexcept (false); +private: + std::vector< char > buffer; + dl::object_vector objs; dl::object_template tmpl; - dl::object_vector objects; + + bool parsed = false; +}; + +struct matcher { + virtual bool match(const dl::ident& pattern, const dl::ident& candidate) + const noexcept (false) = 0; + virtual ~matcher() = default; +}; + +struct exactmatch : public matcher { + bool match(const dl::ident& pattern, const dl::ident& candidate) + const noexcept (false) override; +}; + +/* A queryable pool of metadata objects */ +class pool { +public: + explicit pool( std::vector< dl::object_set > e ) : eflrs(std::move(e)) {}; + + std::vector< dl::ident > types() const noexcept (false); + + object_vector match(const std::string& type, + const std::string& name, + const dl::matcher& matcher); + + object_vector match(const std::string& type, + const dl::matcher& matcher) noexcept (false); + + dl::basic_object& at(const dl::ident&, const dl::obname&) noexcept (false); + dl::basic_object& at(const dl::objref&) noexcept(false); +private: + std::vector< dl::object_set > eflrs; }; const char* parse_template( const char* begin, @@ -491,6 +633,49 @@ const char* parse_set_component( const char*, ident*, int* ) noexcept (false); +/** index_entry - The whereabouts of a Logical Record + * + * index_entry does not contain the data from the Logical Record Segment + * Bodies. Rather, it contains the position of a Logical Record within a file, + * the Logical Record Segment Header attributes - and code (logical record + * type). + * + * consistency indicates if the header information code and attributes are + * consistent across all Logical Record Segment Headers. + * + * I.e. code should hold the same value in all lrsh. While all the bits in + * attributes should be the same across all lrsh, with exception of the + * predecessor and successor bits which should have the following structure: + * + * lrsh predecessor successor + * 0 0 1 + * 1 1 1 + * .. + * n 1 0 + */ +struct index_entry { + long long tell; + std::uint8_t code; // Logical Record Type - Appendix A + std::uint8_t attributes; + bool consistent; + + bool isexplicit() const noexcept (true); + bool isencrypted() const noexcept (true); +}; + +} + +namespace std { + /* Register parsing_info for implicit conversion to std::error_code */ + template <> + struct is_error_code_enum + : public true_type {}; + + /* Register parsing_severity for implicit conversion to std::error_condition + */ + template <> + struct is_error_condition_enum + : public true_type {}; } #endif //DLISIO_EXT_TYPES_HPP diff --git a/lib/src/io.cpp b/lib/src/io.cpp index 706626c0a..468375d44 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -156,6 +157,14 @@ bool record::isencrypted() const noexcept (true) { return this->attributes & DLIS_SEGATTR_ENCRYPT; } +bool index_entry::isexplicit() const noexcept (true) { + return this->attributes & DLIS_SEGATTR_EXFMTLR; +} + +bool index_entry::isencrypted() const noexcept (true) { + return this->attributes & DLIS_SEGATTR_ENCRYPT; +} + namespace { template < typename T > @@ -387,62 +396,87 @@ record& extract(stream& file, long long tell, long long bytes, record& rec) noex } } -stream_offsets findoffsets( dl::stream& file) noexcept (false) { - stream_offsets ofs; +dl::stream_offsets findoffsets(dl::stream& file) noexcept (false) { + dl::stream_offsets idx; + shortvec< std::uint8_t > attributes; + shortvec< int > types; + bool consistent = true; - std::int64_t offset = 0; + std::int64_t offset_segment = 0; + std::int64_t offset_record = offset_segment; char buffer[ DLIS_LRSH_SIZE ]; int len = 0; while (true) { - file.seek(offset); + file.seek(offset_segment); file.read(buffer, DLIS_LRSH_SIZE); - if (file.eof()) + if (file.eof()) { + // TODO: check attributes and types -> should be empty break; + } int type; std::uint8_t attrs; dlis_lrsh( buffer, &len, &attrs, &type ); + attributes.push_back( attrs ); + types.push_back( type ); + int isexplicit = attrs & DLIS_SEGATTR_EXFMTLR; + + // Start of a new record if (not (attrs & DLIS_SEGATTR_PREDSEG)) { - if (isexplicit and type == 0 and ofs.explicits.size()) { + if (isexplicit and type == 0 and idx.explicits.size()) { /* * Wrap up when we encounter a EFLR of type FILE-HEADER that is - * NOT the first Logical Record. More precisely we expect the + * NOT the first Logical offset_record. More precisely we expect the * _first_ LR we encounter to be a FILE-HEADER. We gather up * this LR and all successive LR's until we encounter another * FILE-HEADER. */ - file.seek( offset ); + file.seek( offset_segment ); break; } - if (isexplicit) ofs.explicits.push_back( offset ); - /* - * Consider doing fdata-indexing on the fly as we are now at the - * correct offset to read the OBNAME. That would mean we only need - * to traverse the file a single time to index it. Additionally it - * would make the caller code from python way nicer. - */ - else ofs.implicits.push_back( offset ); + offset_record = offset_segment; + } + + offset_segment += len; + + // We reached the last LRS in the current LR - check consistency and + // add to index + if (not (attrs & DLIS_SEGATTR_SUCCSEG)) { + index_entry entry; + entry.tell = offset_record; + entry.code = types.front(); + entry.attributes = attributes.front(); + entry.consistent = consistent; + + if (not attr_consistent( attributes )) entry.consistent = false; + if (not type_consistent( types )) entry.consistent = false; + + if (entry.isencrypted()) idx.encrypted.push_back(entry); + else if (entry.isexplicit()) idx.explicits.push_back(entry); + else idx.implicits.push_back(entry); + + attributes.clear(); + types.clear(); } - offset += len; } - return ofs; + return idx; } -std::vector< std::pair< std::string, long long > > -findfdata(dl::stream& file, const std::vector< long long >& tells) +std::map< std::string, std::vector< long long > > +findfdata(dl::stream& file, const std::vector< index_entry >& index) noexcept (false) { - std::vector< std::pair< std::string, long long > > xs; + std::map< std::string, std::vector< long long > > xs; constexpr std::size_t OBNAME_SIZE_MAX = 262; record rec; rec.data.reserve( OBNAME_SIZE_MAX ); - for (auto tell : tells) { - extract(file, tell, OBNAME_SIZE_MAX, rec); + for (auto iflr : index) { + extract(file, iflr.tell, OBNAME_SIZE_MAX, rec); if (rec.isencrypted()) continue; if (rec.type != 0) continue; if (rec.data.size() == 0) continue; @@ -461,7 +495,19 @@ noexcept (false) { dl::obname tmp{ dl::origin{ origin }, dl::ushort{ copy }, dl::ident{ std::string{ id, id + idlen } } }; - xs.emplace_back(tmp.fingerprint("FRAME"), tell); + std::string fp = tmp.fingerprint("FRAME"); + + /* Although index.size() often is fairly large, the number of + * unique frames are typically just an handful. Hence the repeated + * calls to find() are not so bad as it might seem at first glace. + */ + const auto itr = xs.find(fp); + if (itr == xs.end()) { + xs.insert( { fp, { iflr.tell } }); + } + else { + itr->second.push_back( iflr.tell ); + } } return xs; } diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 2041a0e11..b5fcc9d68 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include @@ -11,6 +13,112 @@ namespace { +struct parsing_info_cat : std::error_category { + const char* name() const noexcept (true) override; + std::string message(int ev) const override; +}; + +struct parsing_sev_cat : std::error_category { + const char* name() const noexcept (true) override; + std::string message(int ev) const override; + + bool equivalent( const std::error_code& code, int condition) + const noexcept (true) override; +}; + +const char* parsing_info_cat::name() const noexcept (true) { + return "parsing info"; +} + +std::string parsing_info_cat::message(int ev) const +{ + using attr = dl::parsing_info; + switch (static_cast(ev)) { + case attr::absent_value: + return "attr.count==0, value is undefined"; + case attr::invariant: + return "Invariant attribute, using default value from template"; + case attr::dlisio_default: + return "value is defaulted by dlisio, see debug info"; + case attr::attr_invar: + return "attr.invariant != 0, ignoring"; + case attr::attr_label: + return "attr.label != 0, ignoring"; + case attr::noval_count: + return "!attr.value, attr.count > 0"; + case attr::reprc_ne: + return "attr.reprc != templ.reprc"; + case attr::count_eq: + return "attr.count == tmpl.count, using tmpl.value"; + case attr::count_lt: + return "attr.count < tmpl.count, using tmpl.value"; + case attr::count_gt: + return "attr.count > tmpl.count, using tmpl.value"; + case attr::nodefault: + return "!tmpl.value, using empty value with type attr.reprc"; + case attr::reprc_invalid: + return "attr.reprc invalid, using monostate as value"; + default: + return "unrecognised parsing infocode"; + } +} + +const char* parsing_sev_cat::name() const noexcept (true) { + return "Parsing info severity"; +} + +std::string parsing_sev_cat::message(int ev) const { + switch (static_cast< dl::parsing_severity >(ev)) { + case dl::parsing_severity::info: + return "Confirmation that things are working as expected"; + case dl::parsing_severity::debug: + return "Information that may be interesting when debugging"; + case dl::parsing_severity::warning: + default: + return "unrecognised severity code"; + } +} + +bool parsing_sev_cat::equivalent( const std::error_code& code, int condition ) +const noexcept (true) { + using attr = dl::parsing_info; + switch ( static_cast< dl::parsing_severity >(condition) ){ + case dl::parsing_severity::info: + return code == attr::absent_value + || code == attr::invariant; + + case dl::parsing_severity::warning: + return code == attr::dlisio_default + || code == attr::reprc_invalid; + + case dl::parsing_severity::debug: + return code == attr::attr_invar + || code == attr::attr_label + || code == attr::noval_count + || code == attr::reprc_ne + || code == attr::count_eq + || code == attr::count_lt + || code == attr::count_gt + || code == attr::nodefault; + default: + return false; + } +} + +/* The identity of an category is determined by it's address. Following the + * example set by the standard we provide a function that always returns a + * reference to the same object. + */ +const std::error_category& parsing_info_category() { + static parsing_info_cat instance; + return instance; +} + +const std::error_category& parsing_severity_category() { + static parsing_sev_cat instance; + return instance; +} + void user_warning( const std::string& ) noexcept (true) { // TODO: } @@ -540,6 +648,12 @@ struct variant_equal { return false; } + bool operator () (mpark::monostate, + mpark::monostate) + const noexcept (true) { + return true; + } + template < typename T > bool operator () (const std::vector< T >& lhs, const std::vector< T >& rhs) @@ -559,12 +673,21 @@ noexcept (true) { namespace dl { +std::error_code make_error_code(parsing_info e) { + return std::error_code( static_cast(e), parsing_info_category()); +} + +std::error_condition make_error_condition(parsing_severity e) { + return std::error_condition( static_cast(e), parsing_severity_category() ); +} + bool object_attribute::operator == (const object_attribute& o) const noexcept (true) { return this->label == o.label && this->count == o.count && this->reprc == o.reprc && this->units == o.units + && this->info == o.info // invariant doesn't matter for attribute equality, // so ignore it && value_variant_eq(this->value, o.value); @@ -674,6 +797,7 @@ bool basic_object::operator != (const basic_object& o) const noexcept (true) { return !(*this == o); } + const char* parse_template( const char* cur, const char* end, object_template& out ) noexcept (false) { @@ -718,7 +842,8 @@ const char* parse_template( const char* cur, if (flags.value) cur = elements( cur, attr.count, attr.reprc, attr.value ); - attr.invariant = flags.invariant; + if (flags.invariant) + attr.info.push_back(dl::parsing_info::invariant); tmp.push_back( std::move( attr ) ); @@ -768,7 +893,8 @@ struct shrink { void patch_missing_value( dl::value_vector& value, std::size_t count, - dl::representation_code reprc ) + dl::representation_code reprc, + std::vector< dl::parsing_info >& info) noexcept (false) { /* @@ -778,11 +904,15 @@ noexcept (false) if (!mpark::holds_alternative< mpark::monostate >(value)) { const auto size = mpark::visit( len(), value ); /* same size, so return */ - if (size == count) return; + if (size == count) { + info.push_back(dl::parsing_info::count_eq); + return; + } /* smaller, shrink and all is fine */ if (size > count) { mpark::visit( shrink( count ), value ); + info.push_back(dl::parsing_info::count_lt); return; } @@ -791,6 +921,7 @@ noexcept (false) * exception and consider what to do when a file actually uses this * behaviour */ + //TODO use dl::parsing_info::count_gt instead of throw const auto msg = "object attribute without no explicit value, but " "count (which is {}) > size (which is {})" ; @@ -806,6 +937,7 @@ noexcept (false) * making this switch work in the general case */ + info.push_back(dl::parsing_info::nodefault); using rpc = dl::representation_code; switch (reprc) { case rpc::fshort: reset< dl::fshort >(value).resize(count); return; @@ -836,15 +968,14 @@ noexcept (false) case rpc::status: reset< dl::status >(value).resize(count); return; case rpc::units: reset< dl::units >(value).resize(count); return; default: { - const auto msg = "unable to patch attribute with no value: " - "unknown representation code {}"; - const auto code = static_cast< int >(reprc); - throw std::runtime_error(fmt::format(msg, code)); + info.push_back(dl::parsing_info::reprc_invalid); + value = mpark::monostate{}; } } } object_vector parse_objects( const object_template& tmpl, + const dl::ident type, const char* cur, const char* end ) noexcept (false) { @@ -859,10 +990,15 @@ object_vector parse_objects( const object_template& tmpl, cur += DLIS_DESCRIPTOR_SIZE; auto current = default_object; + current.type = type; if (object_flags.name) cur = cast( cur, current.object_name ); for (const auto& template_attr : tmpl) { - if (template_attr.invariant) continue; + auto invariant = std::find( template_attr.info.begin(), + template_attr.info.end(), + dl::parsing_info::invariant ) + != template_attr.info.end(); + if (invariant) continue; if (cur == end) break; const auto flags = parse_attribute_descriptor( cur ); @@ -891,12 +1027,11 @@ object_vector parse_objects( const object_template& tmpl, * Assume this is a mistake, assume it was a regular * non-invariant attribute */ - user_warning("ATTRIB:invariant in attribute, " - "but should only be in template"); + attr.info.push_back(dl::parsing_info::attr_invar); } if (flags.label) { - user_warning( "ATTRIB:label set, but must be null"); + attr.info.push_back(dl::parsing_info::attr_label); } if (flags.count) cur = cast( cur, attr.count ); @@ -917,6 +1052,7 @@ object_vector parse_objects( const object_template& tmpl, */ if (count == 0) { attr.value = mpark::monostate{}; + attr.info.push_back(dl::parsing_info::absent_value); } else if (!flags.value) { /* * Count is non-zero, but there's no value for this attribute. @@ -929,16 +1065,17 @@ object_vector parse_objects( const object_template& tmpl, * TODO: in the future it's possible to allow promotion between * certain codes (ident -> ascii), but is no need for now */ - + attr.info.push_back(dl::parsing_info::noval_count); if (flags.reprc && attr.reprc != template_attr.reprc) { - const auto msg = "count ({}) isn't 0 and representation " - "code ({}) changed, but value is not explicitly set"; - const auto code = static_cast< int >(attr.reprc); - user_warning(fmt::format(msg, count, code)); attr.value = mpark::monostate{}; + attr.info.push_back(dl::parsing_info::reprc_ne); } - patch_missing_value( attr.value, count, attr.reprc ); + patch_missing_value( attr.value, + count, + attr.reprc, + attr.info ); + attr.info.push_back(dl::parsing_info::dlisio_default); } current.set(attr); @@ -987,19 +1124,137 @@ const char* parse_set_component( const char* cur, return cur; } -object_set parse_objects( const char* cur, const char* end ) { - object_set set; - cur = parse_set_component( cur, end, &set.type, &set.name, &set.role); - cur = parse_template( cur, end, set.tmpl ); +bool exactmatch::match(const dl::ident& pattern, const dl::ident& candidate) +const noexcept (false) { + return pattern == candidate; +} - /* - Return if set has no objects - */ + +object_set::object_set(std::vector< char > b) noexcept (false) { + parse_set_component(b.data(), + b.data() + b.size(), + &this->type, + &this->name, + &this->role); + this->buffer = std::move(b); +} + +void object_set::parse() noexcept (false) { + if (this->isparsed()) return; + + const char* beg = this->buffer.data(); + const char* end = beg + this->buffer.size(); + + /* Skip past the set component as it's already been read and parsed */ + auto cur = parse_set_component(beg, end, nullptr, nullptr, nullptr); + + object_template tmpl; + cur = parse_template(cur, end, tmpl); + + // There are no objects in the set if (std::distance( cur, end ) == 0) - return set; + return; + + auto objs = parse_objects(tmpl, this->type, cur, end); + + this->tmpl = tmpl; + this->objs = objs; - set.objects = parse_objects( set.tmpl, cur, end ); - return set; + this->parsed = true; +} + +bool object_set::isparsed() const noexcept (true) { + return this->parsed; +} + +dl::object_vector& object_set::objects() noexcept (false) { + if (not this->isparsed()) + this->parse(); + + return this->objs; +} + +dl::basic_object& object_set::at(const dl::obname& key) noexcept (false) { + auto eq = [&key]( const dl::basic_object& obj ) { + return dl::decay( obj.object_name ) == key; + }; + + auto objects = this->objects(); + // TODO; handle duplications + auto itr = std::find_if( objects.begin(), + objects.end(), + eq ); + + if (itr == objects.end()) { + const auto msg = "object_set.at: No object with fingerprint {} in set"; + const auto fp = key.fingerprint( dl::decay(this->type) ) ; + throw std::out_of_range( fmt::format(msg, fp)); + } + return *itr; +} + +std::vector< dl::ident > pool::types() const noexcept (false) { + std::vector< dl::ident > types; + for (auto eflr : this->eflrs) { + types.push_back( eflr.type ); + } + return types; +} + +dl::basic_object& pool::at(const dl::ident& type, const dl::obname& name) +noexcept (false) { + // TODO A more clever search + dl::basic_object tmp; + for (auto& eflr : this->eflrs) { + if (eflr.type != type) continue; + + /* There might be multiple EFLR's with the correct type, so ignore + * index error while there are still more eflr's to check. + */ + try { + // TODO handle duplications + return eflr.at(name); + } catch (const std::out_of_range&) {} + } + + const auto msg = "pool.at: No object with fingerprint {}"; + const auto fp = name.fingerprint( dl::decay(type) ) ; + throw std::out_of_range( fmt::format(msg, fp)); +} + +dl::basic_object& pool::at(const dl::objref& id ) noexcept (false) { + return this->at(id.type, id.name); +} + +object_vector pool::match( const std::string& type, + const std::string& name, + const dl::matcher& m) +noexcept (false) { + object_vector objs; + + for (auto& eflr : this->eflrs) { + if (not m.match(dl::ident{type}, eflr.type)) continue; + + for (const auto& obj : eflr.objects()) { + if (not m.match(dl::ident{name}, obj.object_name.id)) continue; + + objs.push_back(obj); + } + } + return objs; +} + +object_vector pool::match( const std::string& type, + const dl::matcher& m) +noexcept (false) { + object_vector objs; + + for (auto& eflr : this->eflrs) { + if (not m.match(dl::ident{type}, eflr.type)) continue; + auto tmp = eflr.objects(); + objs.insert(objs.end(), tmp.begin(), tmp.end()); + } + return objs; } } diff --git a/python/data/chap3/object/broken-utf8-object.dlis.part b/python/data/chap3/object/broken-utf8-object.dlis.part index 669210798..bee27e792 100644 Binary files a/python/data/chap3/object/broken-utf8-object.dlis.part and b/python/data/chap3/object/broken-utf8-object.dlis.part differ diff --git a/python/data/chap4-7/README.rst b/python/data/chap4-7/README.rst index 7dfda70c5..e719a19eb 100644 --- a/python/data/chap4-7/README.rst +++ b/python/data/chap4-7/README.rst @@ -140,6 +140,9 @@ Remaining files ================================================ ================================================== Filename Description ================================================ ================================================== +frame-channels.dlis Contains frames with channels with specific + repcodes and signatures for curves testing + invalid-date-in-origin.dlis Simple file which contains invalid creation time attribute in origin @@ -151,4 +154,7 @@ many-logical-files-error-in-last.dlis Contains several logical files, many-logical-files-same-object.dlis Contains 2 logical files with the same objects and encrypted records +match.dlis Various combinations of objects to test match + functions as they require specific name patterns + ================================================ ================================================== diff --git a/python/data/chap4-7/frame-channels.dlis b/python/data/chap4-7/frame-channels.dlis new file mode 100644 index 000000000..62480e596 Binary files /dev/null and b/python/data/chap4-7/frame-channels.dlis differ diff --git a/python/data/chap4-7/match.dlis b/python/data/chap4-7/match.dlis new file mode 100644 index 000000000..5fd71e8e9 Binary files /dev/null and b/python/data/chap4-7/match.dlis differ diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index 3bd959f31..2f45b788b 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -215,22 +215,19 @@ class dlis(object): to link the content of attributes to other objects. """ - def __init__(self, stream, attic, fdata_index, sul=None): + def __init__(self, stream, metadata, fdata_index, encrypted, sul=None): self.file = stream - self.attic = attic + self.metadata = metadata self.sul = sul self.fdata_index = fdata_index + self.encrypted = encrypted + # Use python's re with case-insensitivity as matcher when searching for + # metadata objects in dl::pool + self.matcher = plumbing.regex_matcher(re.IGNORECASE); - self.indexedobjects = defaultdict(dict) - self.problematic = [] + #TODO: deal with 'problematic' - self.record_types = core.parse_set_types(self.attic) - - types = ('FILE-HEADER', 'ORIGIN', 'FRAME', 'CHANNEL') - recs = [rec for rec, t in zip(self.attic, self.record_types) if t in types] - self.load(recs) - - if 'UPDATE' in self.record_types: + if self.metadata.match('UPDATE', '.*', self.matcher): msg = ('{} contains UPDATE-object(s) which changes other ' 'objects. dlisio lacks support for UPDATEs, hence the ' 'data in this logical file might be wrongly presented.') @@ -255,18 +252,8 @@ def __getitem__(self, type): objects : dict all objects of type 'type' """ - if type in self.indexedobjects: - return self.indexedobjects[type] - - recs = [rec - for rec, t - in zip(self.attic, self.record_types) - if t == type - ] - - self.load(recs, reload=False) - - return self.indexedobjects[type] + objs = self.metadata.match(type, core.exactmatch()) + return { x.fingerprint : x for x in self.promote(objs) } def __enter__(self): return self @@ -361,19 +348,12 @@ def unknowns(self): A defaultdict index by object-type """ - recs = [rec for rec, t in zip(self.attic, self.record_types) - if t not in self.types - and not rec.encrypted - and t not in self.indexedobjects - ] - - self.load(recs, reload=False) - - unknowns = defaultdict(list) - - for key, value in self.indexedobjects.items(): - if key in self.types: continue - unknowns[key] = value + unknowns = defaultdict(dict) + for t in self.metadata.types: + if t in unknowns: continue + #TODO handle duplicated types in dl::pool.types + if t in self.types: continue + unknowns[t] = self[t] return unknowns @@ -454,22 +434,8 @@ def match(self, pattern, type="CHANNEL"): Channel(CHANNEL123) """ - def compileregex(pattern): - try: - return re.compile(pattern, re.IGNORECASE) - except: - msg = 'Invalid regex: {}'.format(pattern) - raise ValueError(msg) - - ctype = compileregex(type) - cpattern = compileregex(pattern) - - types = [x for x in set(self.record_types) if re.match(ctype, x)] - - for t in types: - for obj in self[t].values(): - if not re.match(cpattern, obj.name): continue - yield obj + matches = self.metadata.match(type, pattern, self.matcher) + return self.promote(matches) def object(self, type, name, origin=None, copynr=None): """ @@ -503,27 +469,32 @@ def object(self, type, name, origin=None, copynr=None): MKAP """ - if origin is None or copynr is None: - obj = list(self.match('^'+name+'$', type)) - if len(obj) == 1: - return obj[0] - elif len(obj) == 0: - msg = "No objects with name {} and type {} are found" - raise ValueError(msg.format(name, type)) - elif len(obj) > 1: - msg = "There are multiple {}s named {}. Found: {}" - desc = "" - for o in obj: - desc += ("(origin={}, copy={}), " - .format(o.origin, o.copynumber)) - raise ValueError(msg.format(type, name, desc)) + matches = self.metadata.match(type, name, core.exactmatch()) + matches = self.promote(matches) + + if origin is not None: + matches = [o for o in matches if o.origin == origin] + + if copynr is not None: + matches = [o for o in matches if o.copynumber == copynr] + + if len(matches) == 1: return matches[0] + + # Figure out what went wrong + elif len(matches) > 1: + msg = "There are multiple {}s named {}. Found: {}" + desc = "" + for o in matches: + desc += ("(origin={}, copy={}), " + .format(o.origin, o.copynumber)) + raise ValueError(msg.format(type, name, desc)) + + if origin is not None and copynr is not None: + msg = "Object {}.{}.{} of type {} is not found" + raise ValueError(msg.format(name, origin, copynr, type)) else: - fingerprint = core.fingerprint(type, name, origin, copynr) - try: - return self[type][fingerprint] - except KeyError: - msg = "Object {}.{}.{} of type {} is not found" - raise ValueError(msg.format(name, origin, copynr, type)) + msg = "No objects with name {} and type {} are found" + raise ValueError(msg.format(name, type)) def describe(self, width=80, indent=''): """Printable summary of the logical file @@ -554,9 +525,7 @@ def describe(self, width=80, indent=''): plumbing.describe_dict(buf, d, width, indent) known, unknown = {}, {} - for objtype in set(self.record_types): - if objtype == 'encrypted': continue - + for objtype in self.metadata.types: if objtype in self.types: known[objtype] = len(self[objtype]) else: unknown[objtype] = len(self[objtype]) @@ -570,95 +539,19 @@ def describe(self, width=80, indent=''): return plumbing.Summary(info=buf.getvalue()) - def load(self, records=None, reload=True): - """ Load and enrich raw objects into the object pool + def load(self): + """ Force load all metadata - mainly indended for debugging""" + _ = [self[x] for x in self.metadata.types] - This method converts the raw object sets into first-class dlisio python - objects, and puts them in the indexedobjects and problematic members. - - Parameters - ---------- - - records : iterable of records - - reload : bool - If False, append the new object too the pool of objects. If True, - overwrite the existing pool with new objects. - - Notes - ----- - - This method is mainly intended for internal use. It serves as a worker - for other methods that needs to populate the pool with new objects. - It's the callers responsibility to keep track of the current state of - the pool, and not load the same objects into the pool several times. - - Examples - -------- - - When opening a file with dlisio.load('path') only a few object-types - are loaded into the pool. If need be, it is possible to force dlisio to - load every object in the file into its internal cache: - - >>> with dlisio.load('file') as (f, *tail): - ... f.load() - - """ - problem = 'multiple distinct objects ' - where = 'in set {} ({}). Duplicate fingerprint = {}' - action = 'continuing with the last object' - duplicate = 'duplicate fingerprint {}' - - if reload: - indexedobjects = defaultdict(dict) - problematic = [] - - else: - indexedobjects = self.indexedobjects - problematic = self.problematic - - if records is None: records = self.attic - sets = core.parse_objects(records) - - for os in sets: - # TODO: handle replacement sets - for name, o in os.objects.items(): - try: - obj = self.types[os.type](o, name = name, lf = self) - except KeyError: - obj = plumbing.Unknown( - o, - name = name, - type = os.type, - lf = self - ) - - fingerprint = obj.fingerprint - if fingerprint in indexedobjects[os.type]: - original = indexedobjects[os.type][fingerprint] - - logging.info(duplicate.format(fingerprint)) - if original.attic != obj.attic: - msg = problem + where - msg = msg.format(os.type, os.name, fingerprint) - logging.error(msg) - logging.warning(action) - problematic.append((original, obj)) - - indexedobjects[obj.type][fingerprint] = obj - - - self.indexedobjects = indexedobjects - self.problematic = problematic - - if 'FRAME' not in [x.type for x in sets]: return self - - # Frame objects need the additional step of updating its Channels with - # a reference back to itself. See Frame.link() - for obj in self.indexedobjects['FRAME'].values(): - obj.link() - - return self + def promote(self, objects): + objs = [] + for o in objects: + try: + obj = self.types[o.type](o, name=o.name, lf=self) + except KeyError: + obj = plumbing.Unknown(o, name=o.name, type=o.type, lf=self) + objs.append(obj) + return objs def storage_label(self): """Return the storage label of the physical file @@ -790,15 +683,13 @@ def rewind(offset, tif): if tapemarks: stream = core.open_tif(stream) stream = core.open_rp66(stream) - explicits, implicits = core.findoffsets(stream) + explicits, implicits, encrypted = core.findoffsets(stream) hint = rewind(stream.absolute_tell, tapemarks) - records = core.extract(stream, explicits) - fdata_index = defaultdict(list) - for key, val in core.findfdata(stream, implicits): - fdata_index[key].append(val) + metadata = core.metadata_pool(stream, explicits) + fdata_index = core.findfdata(stream, implicits) - lf = dlis(stream, records, fdata_index, sul) + lf = dlis(stream, metadata, fdata_index, encrypted, sul) lfs.append(lf) try: diff --git a/python/dlisio/dlisutils.py b/python/dlisio/dlisutils.py index f1adc5b5a..de932657b 100644 --- a/python/dlisio/dlisutils.py +++ b/python/dlisio/dlisutils.py @@ -10,7 +10,11 @@ def curves(dlis, frame, dtype, pre_fmt, fmt, post_fmt): Reads curves for provided frame and position defined by frame format: pre_fmt (to skip), fmt (to read), post_fmt (to skip) """ - indices = dlis.fdata_index[frame.fingerprint] + try: + indices = dlis.fdata_index[frame.fingerprint] + except KeyError: + indices = list() + alloc = lambda size: np.empty(shape = size, dtype = dtype) return core.read_fdata( pre_fmt, diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index c22c94fab..05e7c4e59 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -688,8 +689,56 @@ noexcept (false) { return dstobj; } +/** trampoline helper class for dl::matcher bindings + * + * Creating the binding code for a abstract c++ class that we want do derive + * new classes from in python requires some extra boilerplate code in the form + * of this "trampoline" class [1]. + * + * This class helps redirect virtual calls back to python and is *not* intended + * to be used for anything other than creating valid bindings for dl::matcher. + * + * [1] https://pybind11.readthedocs.io/en/stable/advanced/classes.html#overriding-virtual-functions-in-python + */ +class Pymatcher : public dl::matcher { +public: + /* Inherit the constructor */ + using dl::matcher::matcher; + + /* Trampoline (need one for each virtual function) */ + bool match(const dl::ident& pattern, const dl::ident& candidate) + const noexcept(false) override { + PYBIND11_OVERLOAD_PURE( + bool, /* Return type */ + dl::matcher, /* Parent class */ + match, /* Name of function in C++ (must match Python name) */ + pattern, /* Argument(s) */ + candidate + ); + } +}; + +void report( const std::vector< std::error_code >& codes, + const std::string& context ) noexcept (false) { + + py::module logging = py::module::import("logging"); + using level = dl::parsing_severity; + for (const auto& code : codes) { + const std::string msg = context + ": " + code.message(); + + if (code == level::debug) logging.attr("debug")(msg); + else if (code == level::info) logging.attr("info")(msg); + else if (code == level::warning) logging.attr("warning")(msg); + /* default to logging.warning for now - should never get her anyway*/ + else logging.attr("warning")(msg); + }; } +} + +PYBIND11_MAKE_OPAQUE( std::vector< dl::object_set > ) +PYBIND11_MAKE_OPAQUE( std::vector< dl::index_entry > ) + PYBIND11_MODULE(core, m) { PyDateTime_IMPORT; @@ -705,6 +754,9 @@ PYBIND11_MODULE(core, m) { } }); + py::bind_vector>(m, "list(object_set)"); + py::bind_vector>(m, "list(index_entry)"); + m.def("open", &dl::open, py::arg("path"), py::arg("zero") = 0); m.def("open_rp66", &dl::open_rp66); m.def("open_tif", &dl::open_tapeimage); @@ -816,26 +868,67 @@ PYBIND11_MODULE(core, m) { }) ; - py::class_< dl::object_set >( m, "object_set" ) - .def_readonly( "type", &dl::object_set::type ) - .def_readonly( "name", &dl::object_set::name ) - .def_property_readonly("objects", - [](const dl::object_set& object_set) { - py::dict objects; - for (const auto& object : object_set.objects) { - py::dict obj; - for (const auto& attr : object.attributes) { - auto label = py::str(dl::decay(attr.label)); - // TODO: need units? So far they're not used - obj[label] = dl::decay(attr.value); - } - const auto& name = dl::decay(object.object_name); - objects[py::cast(name)] = obj; + py::class_< dl::basic_object >( m, "basic_object" ) + .def_readonly("type", &dl::basic_object::type) + .def_readonly("name", &dl::basic_object::object_name) + .def( "__len__", []( const dl::basic_object& o ) { + return o.attributes.size(); + }) + .def( "__eq__", &dl::basic_object::operator == ) + .def( "__ne__", &dl::basic_object::operator != ) + .def( "__getitem__", []( dl::basic_object& o, const std::string& key ) { + dl::object_attribute attr; + try { + attr = o.at(key); + } catch (const std::out_of_range& e) { + throw py::key_error( e.what() ); + } + + if (attr.info.size()) { + const auto msg = o.object_name.fingerprint(dl::decay( o.type )) + + "-A." + + dl::decay( attr.label ); + + std::vector< std::error_code > codes( attr.info.begin(), + attr.info.end() ); + + report(codes, msg); + } + + return attr.value; + }) + .def( "__repr__", []( const dl::basic_object& o ) { + return "dlisio.core.basic_object(name={})"_s + .format(o.object_name); + }) + .def( "keys", []( const dl::basic_object& o ){ + std::vector< dl::ident > keys; + for ( auto attr : o.attributes ) { + keys.push_back( attr.label ); } - return objects; + return keys; }) ; + py::class_< dl::object_set >( m, "object_set" ) + .def_readonly( "type", &dl::object_set::type ) + .def_readonly( "name", &dl::object_set::name ) + .def( "objects", &dl::object_set::objects ) + ; + + py::class_< dl::pool >( m, "logical_file" ) + .def_property_readonly( "types", &dl::pool::types ) + .def( "match", (dl::object_vector (dl::pool::*) ( + const std::string&, + const std::string&, + const dl::matcher& + )) &dl::pool::match ) + .def( "match", (dl::object_vector (dl::pool::*) ( + const std::string&, + const dl::matcher& + )) &dl::pool::match ) + ; + py::enum_< dl::representation_code >( m, "reprc" ) .value( "fshort", dl::representation_code::fshort ) .value( "fsingl", dl::representation_code::fsingl ) @@ -884,6 +977,13 @@ PYBIND11_MODULE(core, m) { }) ; + py::class_< dl::index_entry >( m, "index_entry") + .def_property_readonly( "explicit", &dl::index_entry::isexplicit ) + .def_property_readonly( "encrypted", &dl::index_entry::isencrypted ) + .def_readonly( "code", &dl::index_entry::code ) + .def_readonly( "tell", &dl::index_entry::tell ) + ; + py::class_< dl::stream >( m, "stream" ) .def_property_readonly("absolute_tell", &dl::stream::absolute_tell) .def("seek", &dl::stream::seek) @@ -945,9 +1045,8 @@ PYBIND11_MODULE(core, m) { std::vector< dl::object_set > objects; for (const auto& rec : recs) { if (rec.isencrypted()) continue; - auto begin = rec.data.data(); - auto end = begin + rec.data.size(); - objects.push_back( dl::parse_objects( begin, end ) ); + if (rec.data.size() == 0) continue; + objects.push_back( dl::object_set( rec.data ) ); } return objects; }); @@ -959,9 +1058,31 @@ PYBIND11_MODULE(core, m) { m.def( "findoffsets", []( dl::stream& file ) { const auto ofs = dl::findoffsets( file ); - return py::make_tuple( ofs.explicits, ofs.implicits ); + return py::make_tuple( ofs.explicits, ofs.implicits, ofs.encrypted ); + }); + + m.def( "metadata_pool", []( dl::stream& s, + const std::vector< dl::index_entry >& index ) { + std::vector< dl::object_set > tmp; + for (const auto& eflr : index) { + if (eflr.isencrypted()) continue; + auto rec = dl::extract(s, eflr.tell); + if (rec.data.size() == 0) continue; + tmp.push_back( dl::object_set( rec.data ) ); + } + return dl::pool{ std::move(tmp) }; }); m.def("set_encodings", set_encodings); m.def("get_encodings", get_encodings); + + py::class_< dl::matcher, Pymatcher >( m, "matcher") + .def(py::init<>()) + .def("match", &dl::matcher::match) + ; + + py::class_< dl::exactmatch, dl::matcher >( m, "exactmatch" ) + .def(py::init<>()) + .def("match", &dl::exactmatch::match) + ; } diff --git a/python/dlisio/plumbing/__init__.py b/python/dlisio/plumbing/__init__.py index 58a1f70e4..602aaee41 100644 --- a/python/dlisio/plumbing/__init__.py +++ b/python/dlisio/plumbing/__init__.py @@ -22,6 +22,7 @@ from .process import Process from .unknown import Unknown +from .matcher import * from .valuetypes import * from .linkage import * from .utils import * diff --git a/python/dlisio/plumbing/basicobject.py b/python/dlisio/plumbing/basicobject.py index c9d92738b..1e2c88a93 100644 --- a/python/dlisio/plumbing/basicobject.py +++ b/python/dlisio/plumbing/basicobject.py @@ -196,7 +196,7 @@ def __getitem__(self, key): Returns a default value for missing attributes. I.e. attributes defined in :attr:`attributes` but are not in :attr:`attic`. """ - if key not in self.attributes and key not in self.attic: + if key not in self.attributes and key not in self.attic.keys(): raise KeyError("'{}'".format(key)) try: @@ -215,12 +215,18 @@ def __getitem__(self, key): if key in self.linkage and isreference(rp66value[0]): reftype = self.linkage[key] - value = [lookup(self, reftype, v) for v in rp66value] + value = [lookup(self.logicalfile, reftype, v) for v in rp66value] else: value = [v.strip() if isinstance(v, str) else v for v in rp66value] return parsevalue(value, parse_as) + def __eq__(self, rhs): + try: + return self.attic == rhs.attic + except AttributeError: + return False + @property def fingerprint(self): """ Object fingerprint @@ -253,9 +259,9 @@ def stash(self): all attributes not defined in :attr:`attributes` """ stash = { - key : value - for key, value - in self.attic.items() + key : self.attic[key] + for key + in self.attic.keys() if key not in self.attributes } diff --git a/python/dlisio/plumbing/channel.py b/python/dlisio/plumbing/channel.py index e5d49e684..72e8cb0a5 100644 --- a/python/dlisio/plumbing/channel.py +++ b/python/dlisio/plumbing/channel.py @@ -86,16 +86,28 @@ class Channel(BasicObject): def __init__(self, obj = None, name = None, lf=None): super().__init__(obj, name = name, type = 'CHANNEL', lf=lf) - # The numpy data type of the sample array - self._frame = None @property def frame(self): - if self._frame is not None: - return lookup(self, obname('FRAME'), self._frame) + if self.logicalfile is None: + msg = 'Unable to lookup frame, {} has no logical file' + logging.info(msg.format(self)) + return None + + # Find the frame(s) that are claiming ownership over this channel + frames = findframe(self.fingerprint, self.logicalfile) + + if len(frames) == 1: + return lookup(self.logicalfile, obname('FRAME'), frames[0]) + + if len(frames) == 0: + msg = '{} does not belong to any Frame' + logging.info(msg.format(self)) + + if len(frames) > 1: + msg = '{} belong to multiple frames. Candidates are {}' + logging.info(msg.format(self, frames)) - msg = '{} does not belong to any Frame' - logging.info(msg.format(self)) return None @property @@ -221,7 +233,7 @@ def curves(self): >>> curve[0][1][2] 6 """ - if self._frame is not None: + if self.frame is not None: return np.copy(self.frame.curves()[self.fingerprint]) msg = 'There is no recorded curve-data for {}' diff --git a/python/dlisio/plumbing/frame.py b/python/dlisio/plumbing/frame.py index fa69d904b..2a9288044 100644 --- a/python/dlisio/plumbing/frame.py +++ b/python/dlisio/plumbing/frame.py @@ -123,8 +123,11 @@ def spacing(self): @property def encrypted(self): - if 'ENCRYPTED' in self.attic: return True - else: return False + try: + _ = self.attic['ENCRYPTED'] + return True + except KeyError: + return False @property def index_min(self): @@ -480,22 +483,6 @@ def fmtstrchannel(self, channel): return pre_fmt, ch_fmt, post_fmt - def link(self): - # Reference from a Channel to the Frame it belongs to is not explicitly - # present in file. However it is very convenient that Channels are - # aware of their parent frame. Without a this reference present in the - # file, its the Frame's responsibility to update all it's Channel with - # a reference back to itself. - for ch in self.channels: - try: - if ch._frame: - msg = '{} already belongs to {}, ownership given to {}' - logging.warning(msg.format(ch, ch.frame, self)) - ch._frame = core.obname(self.origin, self.copynumber, self.name) - except AttributeError: - #happens if ch has been parsed as other type - pass - def describe_attr(self, buf, width=80, indent='', exclude=''): if len(self.channels) > 0: if self.index_type is not None: diff --git a/python/dlisio/plumbing/linkage.py b/python/dlisio/plumbing/linkage.py index 535f51057..5aefc3750 100644 --- a/python/dlisio/plumbing/linkage.py +++ b/python/dlisio/plumbing/linkage.py @@ -5,32 +5,28 @@ def obname(objtype): def fingerprint(obj): if not isinstance(obj, core.obname): raise TypeError - return obj.fingerprint(objtype), objtype + return obj, objtype return fingerprint def objref(obj): if not isinstance(obj, core.objref): raise TypeError - return obj.fingerprint, obj.type + return obj.name, obj.type -def lookup(obj, reftype, value): +def lookup(lf, reftype, value): """Create a fingerprint from reftype(value) and look up corresponding - object in the logical file of obj.""" + object in the logical file.""" try: - fp, objtype = reftype(value) + name, objtype = reftype(value) except TypeError: msg = "Unable to create object-reference to '{}'" logging.warning(msg.format(value)) return None try: - return obj.logicalfile[objtype][fp] - except KeyError: - msg = "Referenced object '{}' not in logical file" - logging.warning(msg.format(fp)) - return None - except TypeError: - msg = 'Unable to find referenced object, {} has no logical file' - logging.warning(msg.format(obj)) + return lf.object(objtype, name.id, name.origin, name.copynumber) + except ValueError as e: + msg = "Unable to find linked object: {}" + logging.warning(msg.format(str(e))) return None def isreference(val): @@ -39,3 +35,13 @@ def isreference(val): return (isinstance (val, core.obname) or isinstance (val, core.objref) or isinstance (val, core.attref)) + +def findframe(fp, logical_file): + """Find all frames containing the channel""" + frames = [] + for frame in logical_file.frames: + for channel in frame.channels: + if channel.fingerprint != fp: continue + frames.append(frame.attic.name) + + return frames diff --git a/python/dlisio/plumbing/matcher.py b/python/dlisio/plumbing/matcher.py new file mode 100644 index 000000000..6b6734980 --- /dev/null +++ b/python/dlisio/plumbing/matcher.py @@ -0,0 +1,36 @@ +import re + +from .. import core + +class regex_matcher(core.matcher): + """ Regex matcher + + A regex matcher using Python's re module, that can be passed to + dl::pool::match along with the search patterns. + + Examples + -------- + + create a matcher that is case insensitive and displays debug information, + and pass it to dl::pool with the search patterns for type and name: + + >>> m = matcher(re.IGNORECASE | re.DEBUG) + >>> result = metadata.match("TYPE", "NAME", m) + """ + def __init__(self, flags): + core.matcher.__init__(self) + self.flags = flags + + def match(self, pattern, candidate): + """ Overrides dl::matcher::match """ + try: + compiled = re.compile(str(pattern), flags=self.flags) + except: + msg = 'Invalid regex: {}'.format(pattern) + raise ValueError(msg) + + if (re.match(compiled, str(candidate))): + return True + else: + return False + diff --git a/python/dlisio/plumbing/wellref.py b/python/dlisio/plumbing/wellref.py index b30ba4a51..2b3334266 100644 --- a/python/dlisio/plumbing/wellref.py +++ b/python/dlisio/plumbing/wellref.py @@ -94,8 +94,16 @@ def coordinates(self): value = 'COORDINATE-{}-VALUE' for i in range(1, 4): - key = self.attic.get(name.format(i), [custom_label.format(i)])[0] - val = self.attic.get(value.format(i), [None])[0] + try: + key = self.attic[name.format(i)][0] + except KeyError: + key = custom_label.format(i) + + try: + val = self.attic[value.format(i)][0] + except KeyError: + val = None + coordinates[key] = val return coordinates diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 6f0a38e45..f951d2f40 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -110,6 +110,13 @@ def assert_message(message_id): assert any([message_id in r.message for r in caplog.records]) return assert_message +@pytest.fixture +def assert_debug(caplog): + caplog.set_level(logging.DEBUG) + def assert_message(message_id): + assert any([message_id in r.message for r in caplog.records]) + return assert_message + @pytest.fixture(scope="module") def fpath(tmpdir_factory, merge_files_manyLR): """ diff --git a/python/tests/test_basic_object.py b/python/tests/test_basic_object.py index d6e4577aa..05c6a80ec 100644 --- a/python/tests/test_basic_object.py +++ b/python/tests/test_basic_object.py @@ -19,19 +19,13 @@ def test_getitem_defaultvalue(f): assert obj['INDEX-TYPE'] is None def test_getitem_unexpected_attr(f): - obj = f.object('FRAME', 'FRAME2') - - try: - obj.attic['NEW-ATTR'] = [1] - - # Attributes unknown to dlisio, such as 'NEW-ATTR' should be reachable - # through __getitem__ - assert obj['NEW-ATTR'] == [1] + obj = f.object('CALIBRATION-COEFFICIENT', 'COEFF_BAD') + # Attributes unknown to dlisio, such as 'NEW-ATTR' should be reachable + # through __getitem__ + assert obj['LNKS'] == [18, 32] - # Should also be in stash - assert obj.stash['NEW-ATTR'] == [1] - finally: - del obj.attic['NEW-ATTR'] + # Should also be in stash + assert obj.stash['LNKS'] == [18, 32] def test_getitem_noattribute(f): obj = f.object('FRAME', 'FRAME2') @@ -42,53 +36,31 @@ def test_getitem_noattribute(f): with pytest.raises(KeyError): _ = obj['DUMMY'] -def test_lookup(): - other = Channel() - other.name = 'channel' - other.origin = 10 - other.copynumber = 2 - - lf = dlisio.dlis(None, [], [], []) - lf.indexedobjects['CHANNEL'] = {other.fingerprint : other} - - ch = Channel() - ch.logicalfile = lf - - value = dlisio.core.obname(10, 2, 'channel') - res = lookup(ch, linkage.obname('CHANNEL'), value) +def test_lookup(f): + value = dlisio.core.obname(10, 0, 'CHANN2') + res = lookup(f, linkage.obname('CHANNEL'), value) - assert res == other + assert res.long_name == "CHANN2-LONG-NAME" -def test_lookup_value_not_a_ref(assert_log): - res = lookup(None, linkage.objref, 0) +def test_lookup_value_not_a_ref(f, assert_log): + res = lookup(f, linkage.objref, 0) assert res is None assert_log('Unable to create object-reference') -def test_lookup_value_should_be_objref(assert_log): +def test_lookup_value_should_be_objref(f, assert_log): value = dlisio.core.obname(10, 2, 'channel') - res = lookup(None, linkage.objref, value) + res = lookup(f, linkage.objref, value) assert res is None assert_log('Unable to create object-reference') -def test_lookup_no_logicalfile(assert_log): - value = dlisio.core.obname(10, 2, 'channel') - ch = Channel() #channel without reference to a logical file - - res = lookup(ch, linkage.obname('CHANNEL'), value) - - assert res is None - assert_log('has no logical file') - -def test_lookup_no_such_object(assert_log): +def test_lookup_no_such_object(f, assert_log): value = dlisio.core.obname(10, 2, 'channel') - ch = Channel() - ch.logicalfile = dlisio.dlis(None, [], [], []) - res = lookup(ch, linkage.obname('CHANNEL'), value) + res = lookup(f, linkage.obname('CHANNEL'), value) assert res is None - assert_log('not in logical file') + assert_log('Unable to find linked object') @pytest.mark.xfail(strict=True, reason="attempt to link empty fingerprint") def test_link_empty_object(tmpdir_factory, merge_files_manyLR): diff --git a/python/tests/test_curves.py b/python/tests/test_curves.py index c6ad25291..b3b2fa1b0 100644 --- a/python/tests/test_curves.py +++ b/python/tests/test_curves.py @@ -16,63 +16,6 @@ def load_curves(fpath): curves = frame.curves() return curves -def makeframe(): - frame = dlisio.plumbing.Frame() - frame.name = 'MAINFRAME' - frame.origin = 0 - frame.copynumber = 0 - - time0 = dlisio.plumbing.Channel() - time0.name = 'TIME' - time0.origin = 0 - time0.copynumber = 0 - attic = { - 'DIMENSION': [1], - 'REPRESENTATION-CODE' : [2] # f4 - } - time0.attic = attic - - tdep = dlisio.plumbing.Channel() - tdep.name = 'TDEP' - tdep.origin = 0 - tdep.copynumber = 0 - attic = { - 'DIMENSION': [2], - 'REPRESENTATION-CODE' : [13] # i2 - } - tdep.attic = attic - - time1 = dlisio.plumbing.Channel() - time1.name = 'TIME' - time1.origin = 1 - time1.copynumber = 0 - attic = { - 'DIMENSION' : [1], - 'REPRESENTATION-CODE' : [13], # i2 - } - time1.attic = attic - - #frame.channels = [time0, tdep, time1] - frame.attic = { - 'CHANNELS' : [core.obname(time0.origin, time0.copynumber, time0.name), - core.obname(tdep.origin, tdep.copynumber, tdep.name), - core.obname(time1.origin, time1.copynumber, time1.name)] - } - - logicalfile = dlisio.dlis(None, [], [], []) - logicalfile.indexedobjects['FRAME'] = { frame.fingerprint : frame } - logicalfile.indexedobjects['CHANNEL'] = { - time0.fingerprint : time0, - tdep.fingerprint : tdep, - time1.fingerprint : time1, - } - for objs in logicalfile.indexedobjects.values(): - for obj in objs.values(): - obj.logicalfile = logicalfile - - frame.link() - return frame - def test_curves_are_copy(f): # All channel.curves() really does is to slice the full frame array # returned by frame.curves(). Make sure the returned slice is a copy not a @@ -219,20 +162,21 @@ def test_dimensions_in_multifdata(): np.testing.assert_array_equal(curves[1][1], [[7, 8, 9], [10, 11, 12]]) def test_duplicated_mnemonics_get_unique_labels(): - frame = makeframe() - assert 'ifDDD' == frame.fmtstr() - dtype = frame.dtype() + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + assert 'ifDDD' == frame.fmtstr() + dtype = frame.dtype() - assert ('FRAMENO', 'TIME.0.0', 'TDEP', 'TIME.1.0') == dtype.names + assert ('FRAMENO', 'TIME.0.0', 'TDEP', 'TIME.1.0') == dtype.names - fields = [ - 'FRAMENO', - frame.channels[0].fingerprint, - frame.channels[1].fingerprint, - frame.channels[2].fingerprint, - ] + fields = [ + 'FRAMENO', + frame.channels[0].fingerprint, + frame.channels[1].fingerprint, + frame.channels[2].fingerprint, + ] - assert all(x in dtype.fields for x in fields) + assert all(x in dtype.fields for x in fields) def test_duplicated_mnemonics_dtype_supports_buffer_protocol(): @@ -249,24 +193,21 @@ def test_duplicated_mnemonics_dtype_supports_buffer_protocol(): # the IDENT type, but dlisio imposes no such restriction) # # https://github.com/equinor/dlisio/pull/97 - frame = makeframe() - _ = memoryview(np.zeros(1, dtype = frame.dtype())) + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + _ = memoryview(np.zeros(1, dtype = frame.dtype())) -def test_duplicated_signatures(f, assert_log): - frame = f.object('FRAME', 'FRAME1') +def test_duplicated_signatures(assert_log): + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "DUPLICATED") + with pytest.raises(Exception): + _ = frame.curves() + assert_log("duplicated mnemonics") - frame.channels[1].name = frame.channels[0].name - frame.channels[1].origin = frame.channels[0].origin - frame.channels[1].copynumber = frame.channels[0].copynumber + curves = frame.curves(strict=False) + names = curves.dtype.names - with pytest.raises(Exception): - _ = frame.curves() - assert_log("duplicated mnemonics") - - curves = frame.curves(strict=False) - names = curves.dtype.names - - assert names == ('FRAMENO', 'CHANN1.10.0(0)', 'CHANN1.10.0(1)') + assert names == ('FRAMENO', 'DUPL.0.0(0)', 'DUPL.0.0(1)') def test_mkunique(): types = [ @@ -289,35 +230,38 @@ def test_mkunique(): def test_channel_order(): - frame = makeframe() + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") - ref = [("TIME", 0), ("TDEP", 0), ("TIME", 1)] + ref = [("TIME", 0), ("TDEP", 0), ("TIME", 1)] - for i, ch in enumerate(frame.channels): - assert ch.name == ref[i][0] - assert ch.origin == ref[i][1] + for i, ch in enumerate(frame.channels): + assert ch.name == ref[i][0] + assert ch.origin == ref[i][1] def test_dtype(): - frame = makeframe() + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") - dtype = np.dtype([ - ('FRAMENO', np.int32), - ((frame.channels[0].fingerprint, 'TIME.0.0'), np.float32), - ((frame.channels[1].fingerprint, 'TDEP'), np.int16, (2,)), - ((frame.channels[2].fingerprint, 'TIME.1.0'), np.int16), - ]) + dtype = np.dtype([ + ('FRAMENO', np.int32), + ((frame.channels[0].fingerprint, 'TIME.0.0'), np.float32), + ((frame.channels[1].fingerprint, 'TDEP'), np.int16, (2,)), + ((frame.channels[2].fingerprint, 'TIME.1.0'), np.int16), + ]) - assert frame.dtype() == dtype + assert frame.dtype() == dtype def test_dtype_fmt_instance(): - frame = makeframe() - frame.dtype_fmt = 'x-{:s} {:d}~{:d}' + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + frame.dtype_fmt = 'x-{:s} {:d}~{:d}' - # fmtstr is unchanged - assert 'ifDDD' == frame.fmtstr() - expected_names = ('FRAMENO', 'x-TIME 0~0', 'TDEP', 'x-TIME 1~0') - assert expected_names == frame.dtype().names + # fmtstr is unchanged + assert 'ifDDD' == frame.fmtstr() + expected_names = ('FRAMENO', 'x-TIME 0~0', 'TDEP', 'x-TIME 1~0') + assert expected_names == frame.dtype().names def test_dtype_fmt_class(): original = dlisio.plumbing.Frame.dtype_format @@ -325,10 +269,11 @@ def test_dtype_fmt_class(): try: # change dtype before the object itself is constructed, so it dlisio.plumbing.Frame.dtype_format = 'x-{:s} {:d}~{:d}' - frame = makeframe() - expected_names = ('FRAMENO', 'x-TIME 0~0', 'TDEP', 'x-TIME 1~0') - assert expected_names == frame.dtype().names - assert 'ifDDD' == frame.fmtstr() + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + expected_names = ('FRAMENO', 'x-TIME 0~0', 'TDEP', 'x-TIME 1~0') + assert expected_names == frame.dtype().names + assert 'ifDDD' == frame.fmtstr() finally: # even if the test fails, make sure the format string is reset to its @@ -340,12 +285,12 @@ def test_dtype_fmt_class(): ("x-{:s}.{:d}.{:d}.{:d}"), ]) def test_dtype_wrong_fmt(fmt, assert_log): - frame = makeframe() - - frame.dtype_fmt = fmt - with pytest.raises(Exception): - _ = frame.dtype().names - assert_log("rich label") + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + frame.dtype_fmt = fmt + with pytest.raises(Exception): + _ = frame.dtype().names + assert_log("rich label") def test_channel_curves(): @@ -362,138 +307,55 @@ def test_channel_curves(): frame_curves = load_curves(fpath) assert frame_curves['CH22'] == curves22 -def test_channel_curves_duplicated_mnemonics(f): - frame = f.object('FRAME', 'FRAME1') - frame.channels[1].name = frame.channels[0].name - frame.channels[1].copynumber = frame.channels[0].copynumber+ 1 +def test_channel_curves_duplicated_mnemonics(): + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + channel = f.object("CHANNEL", "TIME", 0, 0) + curve = channel.curves() - ch = frame.channels[0] - curve = ch.curves() - - np.testing.assert_array_equal(curve, frame.curves()[ch.fingerprint]) + np.testing.assert_array_equal(curve, + frame.curves()[channel.fingerprint]) def test_channel_without_frame(assert_info): - channel = dlisio.plumbing.Channel() + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + channel = f.object("CHANNEL", "BELONG_TO_NO_FRAME") + assert channel.curves() == None + assert_info('no recorded curve-data') + + assert channel.frame == None + assert_info('does not belong') + assert channel.curves() == None assert_info('no recorded curve-data') - assert channel.frame == None - assert_info('does not belong') - def test_channel_fmt(): - ch1 = dlisio.plumbing.Channel() - ch1.name = 'ch1' - ch1.origin = 0 - ch1.copynumber = 0 - ch1.attic = { - 'DIMENSION': [5], - 'REPRESENTATION-CODE': [11], - } - - ch2 = dlisio.plumbing.Channel() - ch2.name = 'ch2' - ch2.origin = 0 - ch2.copynumber = 0 - ch2.attic = { - 'DIMENSION': [2, 2], - 'REPRESENTATION-CODE': [3], - } - - ch3 = dlisio.plumbing.Channel() - ch3.name = 'ch3' - ch3.origin = 0 - ch3.copynumber = 0 - ch3.attic = { - 'DIMENSION': [4, 2], - 'REPRESENTATION-CODE': [26], - } - - ch4 = dlisio.plumbing.Channel() - ch4.name = 'ch4' - ch4.origin = 0 - ch4.copynumber = 0 - ch4.attic = { - 'DIMENSION': [1], - 'REPRESENTATION-CODE': [17], - } - - ch5 = dlisio.plumbing.Channel() - ch5.name = 'ch5' - ch5.origin = 0 - ch5.copynumber = 0 - ch5.attic = { - 'DIMENSION': [2, 3, 1], - 'REPRESENTATION-CODE': [12], - } - - frame = dlisio.plumbing.Frame() - frame.name = 'fr' - frame.origin = 0 - frame.copynumber = 0 - frame.attic = { - 'CHANNELS': [ - core.obname(ch1.origin, ch1.copynumber, ch1.name), - core.obname(ch2.origin, ch2.copynumber, ch2.name), - core.obname(ch3.origin, ch3.copynumber, ch3.name), - core.obname(ch4.origin, ch4.copynumber, ch4.name), - core.obname(ch5.origin, ch5.copynumber, ch5.name), - ] - } - - logicalfile = dlisio.dlis(None, [], [], []) - logicalfile.indexedobjects['FRAME'] = { - frame.fingerprint: frame - } - - logicalfile.indexedobjects['CHANNEL'] = { - ch1.fingerprint: ch1, - ch2.fingerprint: ch2, - ch3.fingerprint: ch3, - ch4.fingerprint: ch4, - ch5.fingerprint: ch5, - } - frame.logicalfile = logicalfile - ch1.logicalfile = logicalfile - ch2.logicalfile = logicalfile - ch3.logicalfile = logicalfile - ch4.logicalfile = logicalfile - ch5.logicalfile = logicalfile - - pre_fmt, ch_fmt, post_fmt = frame.fmtstrchannel(ch3) - assert pre_fmt == "CCCCCbbbb" - assert ch_fmt == "qqqqqqqq" - assert post_fmt == "Ldddddd" - + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "VARIOUS") + channel = f.object("CHANNEL", "chn3") + pre_fmt, ch_fmt, post_fmt = frame.fmtstrchannel(channel) + assert pre_fmt == "CCCCCbbbb" + assert ch_fmt == "qqqqqqqq" + assert post_fmt == "Ldddddd" def test_channel_no_dimension(assert_log): - ch = dlisio.plumbing.Channel() - ch.name = 'CH' - ch.origin = 0 - ch.copynumber = 0 - ch.attic = {'REPRESENTATION-CODE': [17]} - - with pytest.raises(ValueError) as exc: - ch.fmtstr() - assert "channel.dimension is invalid" in str(exc.value) - - ch.attic['DIMENSION'] = [1] - assert ch.fmtstr() == "L" - + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + ch = f.object("CHANNEL", "NODIM") + with pytest.raises(ValueError) as exc: + ch.fmtstr() + assert "channel.dimension is invalid" in str(exc.value) def test_frame_index(): - frame = makeframe() - frame.attic['INDEX-TYPE'] = ['DECREASING'] - - assert frame.index == frame.channels[0].name + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + assert frame.index == frame.channels[0].name def test_frame_index_absent(assert_info): - frame = makeframe() - assert frame.index == 'FRAMENO' - -def test_frame_index_absent_nochannels(assert_info): - frame = dlisio.plumbing.Frame() - frame.attic['INDEX-TYPE'] = ['DECREASING'] - - assert frame.index is None - assert_info('Frame has no channels') - + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "NONINDEXED") + assert frame.index == 'FRAMENO' + +def test_frame_index_nochannels(assert_info): + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "INDEXED_NO_CHANNELS") + assert frame.index is None + assert_info('Frame has no channels') diff --git a/python/tests/test_encodings.py b/python/tests/test_encodings.py index 35e060a97..bdc6504e8 100644 --- a/python/tests/test_encodings.py +++ b/python/tests/test_encodings.py @@ -46,7 +46,8 @@ def test_broken_utf8_value(tmpdir, merge_files_oneLR): dlisio.set_encodings([]) with pytest.warns(UnicodeWarning): with dlisio.load(path) as (f, *_): - f.load() + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT') + _ = obj['DEFAULT_ATTRIBUTE'] try: dlisio.set_encodings(['koi8_r']) with dlisio.load(path) as (f, *_): @@ -57,7 +58,6 @@ def test_broken_utf8_value(tmpdir, merge_files_oneLR): finally: dlisio.set_encodings(prev_encodings) -@pytest.mark.skip(reason="not warn on warning and sigabrt on second") def test_broken_utf8_obname_value(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'broken_utf8_obname_value.dlis') content = [ @@ -67,20 +67,24 @@ def test_broken_utf8_obname_value(tmpdir, merge_files_oneLR): 'data/chap3/objattr/broken-utf8-obname-value.dlis.part', ] merge_files_oneLR(path, content) - with pytest.warns(UnicodeWarning): - with dlisio.load(path): - pass + prev_encodings = dlisio.get_encodings() + dlisio.set_encodings([]) + try: + f, = dlisio.load(path) + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + obname = obj['DEFAULT_ATTRIBUTE'][0] + + with pytest.warns(UnicodeWarning): + _ = obname.id + dlisio.set_encodings(['koi8_r']) - with dlisio.load(path) as (f, *_): - obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) - obname = (2, 2, 'КОТ') - assert obj.attic['DEFAULT_ATTRIBUTE'] == [obname] + assert obname.id == 'КОТ' finally: dlisio.set_encodings(prev_encodings) + f.close() -@pytest.mark.xfail(strict=True, reason="fingerprint error on no encoding") def test_broken_utf8_object_name(tmpdir, merge_files_oneLR): #some actual files have obname which fails with utf-8 codec path = os.path.join(str(tmpdir), 'broken_utf8_object_name.dlis') @@ -90,18 +94,24 @@ def test_broken_utf8_object_name(tmpdir, merge_files_oneLR): 'data/chap3/object/broken-utf8-object.dlis.part', ] merge_files_oneLR(path, content) - with pytest.warns(UnicodeWarning): - with dlisio.load(path): - pass + prev_encodings = dlisio.get_encodings() + dlisio.set_encodings([]) + try: + f, = dlisio.load(path) + with pytest.warns(UnicodeWarning): + _ = f.match('.*', 'VERY_MUCH_TESTY_SET') + dlisio.set_encodings(['koi8_r']) - with dlisio.load(path) as (f, *_): - _ = f.object('VERY_MUCH_TESTY_SET', 'КАДР', 12, 4) + + objs = f.match('.*', 'VERY_MUCH_TESTY_SET') + [obj] = [x for x in objs] + assert obj.name == 'КАДР' finally: dlisio.set_encodings(prev_encodings) + f.close() -@pytest.mark.xfail(strict=True, reason="could not allocate string object") def test_broken_utf8_label(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'broken_utf8_label.dlis') content = [ @@ -110,19 +120,23 @@ def test_broken_utf8_label(tmpdir, merge_files_oneLR): 'data/chap3/object/object.dlis.part', ] merge_files_oneLR(path, content) - with pytest.warns(UnicodeWarning): - with dlisio.load(path): - pass + prev_encodings = dlisio.get_encodings() + dlisio.set_encodings([]) + try: + f, = dlisio.load(path) + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + + with pytest.warns(UnicodeWarning): + _ = obj.attic.keys() + dlisio.set_encodings(['koi8_r']) - with dlisio.load(path) as (f, *_): - obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) - assert obj.attic['ДОХЛЫЙ-ПАРАМЕТР'] == ['Have a nice day!'] + assert 'ДОХЛЫЙ-ПАРАМЕТР' in obj.attic.keys() finally: + f.close() dlisio.set_encodings(prev_encodings) -@pytest.mark.xfail(strict=True, reason="fingerprint error on no encoding") @pytest.mark.future_test_set_names def test_broken_utf8_set(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'broken_utf8_set.dlis') @@ -133,16 +147,20 @@ def test_broken_utf8_set(tmpdir, merge_files_oneLR): 'data/chap3/object/object.dlis.part', ] merge_files_oneLR(path, content) - with pytest.warns(UnicodeWarning): - with dlisio.load(path) as (f, *_): - f.load() + prev_encodings = dlisio.get_encodings() + dlisio.set_encodings([]) + try: + f, = dlisio.load(path) + with pytest.warns(UnicodeWarning): + _ = f.metadata.types + dlisio.set_encodings(['koi8_r']) - with dlisio.load(path) as (f, *_): - _ = f.object('СЕТ_КИРИЛЛИЦЕЙ', 'OBJECT', 1, 1) - #assert set_name == 'МЕНЯ.ЗОВУТ.СЕТ' + assert 'СЕТ_КИРИЛЛИЦЕЙ' in f.metadata.types + #assert set_name == 'МЕНЯ.ЗОВУТ.СЕТ' finally: dlisio.set_encodings(prev_encodings) + f.close() diff --git a/python/tests/test_logical_file.py b/python/tests/test_logical_file.py index 323660148..7f36cab1c 100644 --- a/python/tests/test_logical_file.py +++ b/python/tests/test_logical_file.py @@ -11,173 +11,113 @@ from dlisio import core -@pytest.fixture(scope="module") -def g(): - s = dlisio.open("tests/test_logical_file.py") #any existing file is required - g = dlisio.dlis(s, [], [], []) - - ch = Channel() - ch.name = 'CHANNEL1' - ch.origin = 0 - ch.copynumber = 0 - ch.logicalfile = g - g.indexedobjects["CHANNEL"][ch.fingerprint] = ch - - ch = Channel() - ch.name = 'CHANNEL1.V2' - ch.origin = 0 - ch.copynumber = 0 - ch.logicalfile = g - g.indexedobjects["CHANNEL"][ch.fingerprint] = ch - - ch = Channel() - ch.name = 'CHANNEL1' - ch.origin = 0 - ch.copynumber = 1 - ch.logicalfile = g - g.indexedobjects["CHANNEL"][ch.fingerprint] = ch - - un = Unknown() - un.name = 'UNEFRAME' - un.origin = 0 - un.copynumber = 0 - un.type = "NONCHANNEL" - un.logicalfile = g - g.indexedobjects["NONCHANNEL"][un.fingerprint] = un - - fr = Frame() - fr.name = 'UNEFRAME' - fr.origin = 0 - fr.copynumber = 0 - fr.logicalfile = g - g.indexedobjects["FRAME"][fr.fingerprint] = fr - - un = Unknown() - un.name = '440-CHANNEL' - un.origin = 0 - un.copynumber = 0 - un.type = "440.TYPE" - un.logicalfile = g - g.indexedobjects["440.TYPE"][un.fingerprint] = un - - ch = Channel() - ch.name = '440.CHANNEL' - ch.origin = 0 - ch.copynumber = 0 - ch.type = "440-TYPE" - ch.logicalfile = g - g.indexedobjects["440-TYPE"][ch.fingerprint] = ch - - g.record_types = list(g.indexedobjects.keys()) - - # Simulate the occurance of multiple Channel sets - g.record_types.append('CHANNEL') - return g - -def test_object(g): - channel = g.object("CHANNEL", "CHANNEL1", 0, 1) - assert channel.name == "CHANNEL1" - assert channel.origin == 0 - assert channel.copynumber == 1 +def test_object(f): + channel = f.object("CHANNEL", "CHANN1", 10, 0) + assert channel.name == "CHANN1" + assert channel.origin == 10 + assert channel.copynumber == 0 assert channel.type == "CHANNEL" -def test_object_unknown(g): - channel = g.object("NONCHANNEL", "UNEFRAME", 0, 0) - assert channel.name == "UNEFRAME" - assert channel.origin == 0 +def test_object_unknown(f): + channel = f.object("UNKNOWN_SET", "OBJ1", 10, 0) + assert channel.name == "OBJ1" + assert channel.origin == 10 assert channel.copynumber == 0 - assert channel.type == "NONCHANNEL" + assert channel.type == "UNKNOWN_SET" -def test_object_nonexisting(g): +def test_object_nonexisting(f): with pytest.raises(ValueError) as exc: - _ = g.object("UNKNOWN_TYPE", "SOME_OBJECT", 0, 0) + _ = f.object("UNKNOWN_TYPE", "SOME_OBJECT", 0, 0) assert "not found" in str(exc.value) with pytest.raises(ValueError): - _ = g.object("CHANNEL", "CHANNEL1", 11, 0) + _ = f.object("CHANNEL", "CHANN1", 11, 0) - with pytest.raises(TypeError): - _ = g.object("WEIRD", "CHANNEL1", "-1", "-1") + with pytest.raises(ValueError): + _ = f.object("WEIRD", "CHANN1", "-1", "-1") -def test_object_solo_nameonly(g): - channel = g.object("CHANNEL", "CHANNEL1.V2") - assert channel.name == "CHANNEL1.V2" - assert channel.origin == 0 +def test_object_solo_nameonly(f): + channel = f.object("CHANNEL", "CHANN2") + assert channel.name == "CHANN2" + assert channel.origin == 10 assert channel.copynumber == 0 - assert channel.type == "CHANNEL" + assert channel.type == "CHANNEL" -def test_object_nonexisting_nameonly(g): +def test_object_nonexisting_nameonly(f): with pytest.raises(ValueError) as exc: - _ = g.object("CHANNEL", "NOTFOUND") + _ = f.object("CHANNEL", "NOTFOUND") assert "No objects" in str(exc.value) -def test_object_many_objects_nameonly(g): - with pytest.raises(ValueError) as exc: - _ = g.object("CHANNEL", "CHANNEL1") - assert "There are multiple" in str(exc.value) - -def test_match(g): - refs = [] - refs.append( g.object('CHANNEL', 'CHANNEL1', 0, 0) ) - refs.append( g.object('CHANNEL', 'CHANNEL1.V2', 0, 0) ) - refs.append( g.object('CHANNEL', 'CHANNEL1', 0, 1) ) +def test_object_many_objects_nameonly(tmpdir_factory, merge_files_manyLR): + with dlisio.load("data/chap4-7/match.dlis") as (f, *_): + with pytest.raises(ValueError) as exc: + _ = f.object("CHANNEL", "MATCH1") + assert "There are multiple" in str(exc.value) - channels = g.match('.*chan.*') +def test_match(): + with dlisio.load("data/chap4-7/match.dlis") as (f, *_): + refs = [] + refs.append(f.object('CHANNEL', 'MATCH1', 16, 0)) + refs.append(f.object('CHANNEL', 'MATCH111', 16, 0)) + refs.append(f.object('CHANNEL', 'MATCH1', 127, 0)) - assert len(list(channels)) == 3 - for ch in channels: - assert ch in refs + channels = f.match('.*match1.*') - channels = g.match('.*chan.*', ".*") - assert len(list(channels)) == 5 + assert len(list(channels)) == 3 + for ch in channels: + assert ch in refs -def test_match_type(g): - refs = [] + channels = f.match('.*match1.*', ".*") + assert len(list(channels)) == 5 - refs.append( g.object('NONCHANNEL', 'UNEFRAME', 0, 0) ) - refs.append( g.object('FRAME', 'UNEFRAME', 0, 0) ) +def test_match_type(): + with dlisio.load("data/chap4-7/match.dlis") as (f, *_): + refs = [] + refs.append( f.object('MATCH', 'MATCH22', 16, 0) ) + refs.append( f.object('FRAME', 'MATCH22', 16, 0) ) - objs = g.match('UNEFR.*', type='NONCHANNEL|FRAME') + objs = f.match('MATCH2.*', type='MATCH|FRAME') - assert len(list(objs)) == len(refs) - for obj in objs: - assert obj in refs + assert len(list(objs)) == len(refs) + for obj in objs: + assert obj in refs - objs = g.match('', type='NONCHANNEL|frame') + objs = f.match('', type='MATCH|frame') - assert len(list(objs)) == len(refs) - for obj in objs: - assert obj in refs + assert len(list(objs)) == len(refs) + for obj in objs: + assert obj in refs -def test_match_invalid_regex(g): +def test_match_invalid_regex(f): with pytest.raises(ValueError): - _ = next(g.match('*')) + _ = next(f.match('*')) with pytest.raises(ValueError): - _ = next(g.match('AIBK', type='*')) + _ = next(f.match('AIBK', type='*')) -def test_match_special_characters(g): - o1 = g.object('440.TYPE', '440-CHANNEL', 0, 0) - o2 = g.object('440-TYPE', '440.CHANNEL', 0, 0) +def test_match_special_characters(): + with dlisio.load("data/chap4-7/match.dlis") as (f, *_): + o1 = f.object('440.TYPE', '440-MATCH1', 16, 0) + o2 = f.object('440-TYPE', '440.MATCH1', 16, 0) - refs = [o1, o2] - channels = g.match('440.CHANNEL', '440.TYPE') + refs = [o1, o2] + channels = f.match('440.MATCH1', '440.TYPE') - assert len(list(channels)) == 2 - for ch in channels: - assert ch in refs + assert len(list(channels)) == 2 + for ch in channels: + assert ch in refs - refs = [o1] - channels = g.match('440-CHANNEL', '440.TYPE') - assert len(list(channels)) == 1 - for ch in channels: - assert ch in refs + refs = [o1] + channels = f.match('440-MATCH1', '440.TYPE') + assert len(list(channels)) == 1 + for ch in channels: + assert ch in refs - refs = [o2] - channels = g.match('440.CHANNEL', '440-TYPE') - assert len(list(channels)) == 1 - for ch in channels: - assert ch in refs + refs = [o2] + channels = f.match('440.MATCH1', '440-TYPE') + assert len(list(channels)) == 1 + for ch in channels: + assert ch in refs def test_indexedobjects(f): assert f.fileheader.name == "N" @@ -202,52 +142,7 @@ def test_indexedobjects(f): assert len(f.comments) == 1 assert len(f.messages) == 1 -def test_indexedobjects_initial_load(fpath): - with dlisio.load(fpath) as (f, *tail): - # Only fileheader, origin, frame, and channel should be loaded - assert len(f.indexedobjects) == 4 - - -def test_indexedobjects_load_all(fpath): - with dlisio.load(fpath) as (f, *_): - f.load() - assert len(f.indexedobjects) == 23 - -def test_indexedobjects_load_unknowns(): +def test_load_unknowns(): with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as (f,): - assert len(f.indexedobjects) == 4 #FILE-HEADER, ORIGIN, FRAME, CHANNEL - assert len(f.unknowns) == 5 - assert len(f.indexedobjects) == 9 - -def test_indexedobjects_load_by_typeloading(fpath): - with dlisio.load(fpath) as (f, *tail): - fp = core.fingerprint('PARAMETER', 'PARAM1', 10, 0) - parameters = f.parameters - - assert len(parameters) == 3 - assert fp in f.indexedobjects['PARAMETER'] - -def test_indexedobjects_load_by_direct_call(fpath): - with dlisio.load(fpath) as (f, *tail): - fp = core.fingerprint('TOOL', 'TOOL1', 10, 0) - _ = f.object('TOOL', 'TOOL1', 10, 0) - - assert fp in f.indexedobjects['TOOL'] - -def test_indexedobjects_load_by_match(fpath): - with dlisio.load(fpath) as (f, *tail): - fp = core.fingerprint('MESSAGE', 'MESSAGE1', 10, 0) - - _ = list(f.match('.*' , type='MESSAGE')) - - assert fp in f.indexedobjects['MESSAGE'] - -def test_indexedobjects_load_by_link(fpath): - with dlisio.load(fpath) as (f, *tail): - fp = core.fingerprint('LONG-NAME', 'CHANN1-LONG-NAME', 10, 0) - ch = f.object('CHANNEL', 'CHANN1') + assert len(f.unknowns) == 5 - # Accessing long-name should trigger loading of all long-names - _ = ch.long_name - assert fp in f.indexedobjects['LONG-NAME'] - assert len(f.indexedobjects['LONG-NAME']) == 4 diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index 4f3fab24f..5ec7b2e08 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -45,7 +45,7 @@ def test_default_attribute_cut(tmpdir, merge_files_oneLR): assert obj.attic['DEFAULT_ATTRIBUTE'] @pytest.mark.future_test_attributes -def test_invariant_attribute(tmpdir, merge_files_oneLR): +def test_invariant_attribute(tmpdir, merge_files_oneLR, assert_info): path = os.path.join(str(tmpdir), 'invariant_attribute.dlis') content = [ 'data/chap3/start.dlis.part', @@ -62,9 +62,9 @@ def test_invariant_attribute(tmpdir, merge_files_oneLR): #assert attr.units == 'invariant units' assert attr == [False, False, True] + assert_info('Invariant attribute') -@pytest.mark.future_warning_invariant_attribute -def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR): +def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR, assert_debug): path = os.path.join(str(tmpdir), 'invariant-attribute-in-object.dlis') content = [ 'data/chap3/start.dlis.part', @@ -79,6 +79,7 @@ def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR): attr = obj.attic['DEFAULT_ATTRIBUTE'] assert attr == [8.0] + assert_debug('attr.invariant') def test_absent_attribute(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'attribute_absent.dlis') @@ -301,8 +302,7 @@ def test_repcode_invalid_in_objects(tmpdir, merge_files_oneLR): f.load() assert "unknown representation code" in str(excinfo.value) -@pytest.mark.future_warning_repcode_different_no_value -def test_repcode_different_no_value(tmpdir, merge_files_oneLR): +def test_repcode_different_no_value(tmpdir, merge_files_oneLR, assert_log, assert_debug): path = os.path.join(str(tmpdir), 'different-repcode-no-value.dlis') content = [ 'data/chap3/start.dlis.part', @@ -316,8 +316,11 @@ def test_repcode_different_no_value(tmpdir, merge_files_oneLR): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'] == [0j, 0j] -@pytest.mark.future_test_attributes -def test_count0_novalue(tmpdir, merge_files_oneLR): + assert_debug('!attr.value') + assert_debug('attr.reprc != templ.reprc') + assert_log('defaulted by dlisio') + +def test_count0_novalue(tmpdir, merge_files_oneLR, assert_info): path = os.path.join(str(tmpdir), 'count0-novalue.dlis') content = [ 'data/chap3/start.dlis.part', @@ -333,9 +336,10 @@ def test_count0_novalue(tmpdir, merge_files_oneLR): #assert attr.count == 0 assert attr == None + assert_info('value is undefined') -@pytest.mark.future_test_attributes -def test_count0_value_bit(tmpdir, merge_files_oneLR): + +def test_count0_value_bit(tmpdir, merge_files_oneLR, assert_info): path = os.path.join(str(tmpdir), 'count0-value-bit.dlis') content = [ 'data/chap3/start.dlis.part', @@ -351,6 +355,7 @@ def test_count0_value_bit(tmpdir, merge_files_oneLR): #assert attr.count == 0 assert attr == None + assert_info('value is undefined') @pytest.mark.future_test_attributes def test_count0_different_repcode(tmpdir, merge_files_oneLR): @@ -371,8 +376,7 @@ def test_count0_different_repcode(tmpdir, merge_files_oneLR): assert attr == None -@pytest.mark.future_warning_label_bit_set_in_object_attr -def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR): +def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR, assert_log, assert_debug): path = os.path.join(str(tmpdir), 'label_bit_set_in_attribute.dlis') content = [ 'data/chap3/start.dlis.part', @@ -384,8 +388,11 @@ def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *tail): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) - assert obj.attic['DEFAULT_ATTRIBUTE'] + assert obj['DEFAULT_ATTRIBUTE'] + assert_debug('attr.label') + assert_debug('!attr.value') + assert_log('defaulted by dlisio') @pytest.mark.future_warning_label_bit_not_set_in_template @pytest.mark.not_implemented_datetime_timezone @@ -436,8 +443,7 @@ def test_object_name_bit_not_set_in_object(tmpdir, merge_files_oneLR): assert obj.attic['DEFAULT_ATTRIBUTE'] -@pytest.mark.future_test_attributes -def test_novalue_less_count(tmpdir, merge_files_oneLR): +def test_novalue_less_count(tmpdir, merge_files_oneLR, assert_log, assert_debug): path = os.path.join(str(tmpdir), 'novalue-less-count.dlis') content = [ 'data/chap3/start.dlis.part', @@ -455,6 +461,9 @@ def test_novalue_less_count(tmpdir, merge_files_oneLR): #assert attr.units == 'default attr units' assert attr == [-0.75] + assert_debug('!attr.value, attr.count') + assert_debug('attr.count < tmpl.count') + assert_log('defaulted by dlisio') @pytest.mark.not_implemented def test_novalue_more_count(tmpdir, merge_files_oneLR): @@ -555,7 +564,6 @@ def test_unexpected_attribute_in_set(tmpdir, merge_files_oneLR): dlisio.load(path) assert "expected SET" in str(excinfo.value) - def test_unexpected_set_in_object(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'unexpected-set-in-object.dlis') content = [ @@ -631,9 +639,7 @@ def test_cut_before_object(tmpdir, merge_files_oneLR): ] merge_files_oneLR(path, content) with dlisio.load(path) as (f,): - objects = {} - for v in f.indexedobjects.values(): - objects.update(v) + objects = f.match('.*', '.*') assert len(objects) == 0 diff --git a/python/tests/test_monkey_patching.py b/python/tests/test_monkey_patching.py index a1fdb4895..c5874868f 100644 --- a/python/tests/test_monkey_patching.py +++ b/python/tests/test_monkey_patching.py @@ -52,7 +52,7 @@ def test_type_new(f): finally: del f.types['UNKNOWN_SET'] - +@pytest.mark.skip(reason='typechange not propegated to dl::pool') def test_type_change(f): try: # Parse all parameters as if they where Channels @@ -100,12 +100,8 @@ def test_type_removal(f): # to not interfere with other tests f.types['CHANNEL'] = dlisio.plumbing.Channel - f.load() - obj = f.object('CHANNEL', 'CHANN1', 10, 0) - # Channels should now be parsed as Channel.allobjects - assert isinstance(obj, dlisio.plumbing.Channel) - assert obj not in f.unknowns + assert 'CHANNEL' not in f.unknowns def test_attribute_change_in_instance(): ch1 = Channel() diff --git a/python/tests/test_object_structures.py b/python/tests/test_object_structures.py index 291f3c6d1..967cec25c 100644 --- a/python/tests/test_object_structures.py +++ b/python/tests/test_object_structures.py @@ -5,6 +5,7 @@ from datetime import datetime import numpy as np +import pytest import dlisio def test_file_header(f): diff --git a/python/tests/test_partitioning.py b/python/tests/test_partitioning.py index d481fe4df..8eb94684c 100644 --- a/python/tests/test_partitioning.py +++ b/python/tests/test_partitioning.py @@ -10,10 +10,7 @@ def test_partitioning(): assert len(tail) == 0 def getobjects(f): - objects = {} - for v in f.indexedobjects.values(): - objects.update(v) - return objects + return f.match('.*', '.*') assert len(getobjects(f1)) == 8 assert len(getobjects(f2)) == 7 @@ -21,14 +18,14 @@ def getobjects(f): key = dlisio.core.fingerprint('FRAME', 'FRAME-INC', 10, 0) - assert f1.record_types == ['ORIGIN', 'CHANNEL', 'FRAME'] + assert f1.metadata.types == ['ORIGIN', 'CHANNEL', 'FRAME'] assert not f1.fdata_index - assert f2.record_types == ['FILE-HEADER', 'ORIGIN', 'CHANNEL', - 'FRAME', 'FRAME'] + assert f2.metadata.types == ['FILE-HEADER', 'ORIGIN', 'CHANNEL', + 'FRAME', 'FRAME'] assert f2.fdata_index[key] == [824, 1060] - assert f3.record_types == ['FILE-HEADER'] + assert f3.metadata.types == ['FILE-HEADER'] assert not f3.fdata_index def test_objects_ownership(): diff --git a/python/tests/test_physical_layout.py b/python/tests/test_physical_layout.py index 615572937..0e21c419b 100644 --- a/python/tests/test_physical_layout.py +++ b/python/tests/test_physical_layout.py @@ -45,19 +45,19 @@ def test_lrs_atributes_inconsistency(): def test_padbytes_as_large_as_record(): path = 'data/chap2/padbytes-large-as-record.dlis' with dlisio.load(path) as (f,): - assert len(f.attic) == 0 + assert len(f.match('.*', '.*')) == 0 assert len(f.fdata_index) == 0 def test_padbytes_as_large_as_segment_explicit(): path = 'data/chap2/padbytes-large-as-seg-explict.dlis' with dlisio.load(path) as (f,): - assert len(f.attic) == 0 + assert len(f.match('.*', '.*')) == 0 assert len(f.fdata_index) == 0 def test_padbytes_as_large_as_segment_implicit(): path = 'data/chap2/padbytes-large-as-seg-implicit.dlis' with dlisio.load(path) as (f,): - assert len(f.attic) == 0 + assert len(f.match('.*', '.*')) == 0 assert len(f.fdata_index) == 0 def test_padbytes_bad():