From f8e349fdb89af65a7a60f3b99d0cf93c9545cffb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Wed, 17 Jun 2020 13:33:50 +0200 Subject: [PATCH 01/27] Attach object type to basic_object The type of an object is defined on the object set, not on the actual object, thus an instances of basic_object must remain linked to it's set in order to have a full representation of the object. The concept of a object set is not semantically intresting after the objects in the set are parsed. But because of the type, the set has to be carried through all the way to python. By setting the type directly on the basic_object when it's parsed the need to keep it close to the set goes away. It also fasilitates the posibility of removing object sets from the C++ API all together. --- lib/extension/dlisio/ext/types.hpp | 1 + lib/src/parse.cpp | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index e32db3960..bfb595220 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -459,6 +459,7 @@ struct basic_object { bool operator != (const basic_object&) const noexcept (true); dl::obname object_name; + dl::ident type; std::vector< object_attribute > attributes; }; diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 2041a0e11..4cd5894f6 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -845,6 +845,7 @@ noexcept (false) } object_vector parse_objects( const object_template& tmpl, + const dl::ident type, const char* cur, const char* end ) noexcept (false) { @@ -859,6 +860,7 @@ object_vector parse_objects( const object_template& tmpl, cur += DLIS_DESCRIPTOR_SIZE; auto current = default_object; + current.type = type; if (object_flags.name) cur = cast( cur, current.object_name ); for (const auto& template_attr : tmpl) { @@ -998,7 +1000,7 @@ object_set parse_objects( const char* cur, const char* end ) { if (std::distance( cur, end ) == 0) return set; - set.objects = parse_objects( set.tmpl, cur, end ); + set.objects = parse_objects( set.tmpl, set.type, cur, end ); return set; } From df589fda6aa9f71f5683c7361c62f9eec35907cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Tue, 30 Jun 2020 12:59:37 +0200 Subject: [PATCH 02/27] Formalize and enrich the index Logical Record Segment Headers contain useful information when determining the content of the record. Previously we had to extract the entire record in order to retreive information about it's content - often requiring our control flow to extract more records than needed. The lrsh are already being read when indexing so the only cost here is an increase in the size of the index. --- lib/extension/dlisio/ext/io.hpp | 6 +++--- lib/extension/dlisio/ext/types.hpp | 16 +++++++++++++++ lib/src/io.cpp | 32 +++++++++++++++++++----------- python/dlisio/__init__.py | 4 ++-- python/dlisio/ext/core.cpp | 7 +++++++ 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index d65a5b648..96bf185c3 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -65,8 +65,8 @@ class stream { struct stream_offsets { - std::vector< long long > explicits; - std::vector< long long > implicits; + std::vector< index_entry > explicits; + std::vector< index_entry > implicits; }; stream open(const std::string&, std::int64_t) noexcept (false); @@ -80,7 +80,7 @@ bool hastapemark(stream&) noexcept (false); record extract(stream&, long long) noexcept (false); record& extract(stream&, long long, long long, record&) noexcept (false); -stream_offsets findoffsets(dl::stream&) noexcept (false); +dl::stream_offsets findoffsets(dl::stream&) noexcept (false); std::vector< std::pair< std::string, long long > > findfdata(dl::stream&, const std::vector< long long >&) noexcept (false); diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index bfb595220..fb734afee 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -492,6 +492,22 @@ const char* parse_set_component( const char*, ident*, int* ) noexcept (false); +/** index_entry - The whereabouts of a Logical Record + * + * index_entry does not contain the data from the Logical Record Segment + * Bodies. Rather, it contains the position of a Logical Record within a file, + * the Logical Record Segment Header attributes - and code (logical record + * type). + */ +struct index_entry { + long long tell; + std::uint8_t code; // Logical Record Type - Appendix A + std::uint8_t attributes; + + bool isexplicit() const noexcept (true); + bool isencrypted() const noexcept (true); +}; + } #endif //DLISIO_EXT_TYPES_HPP diff --git a/lib/src/io.cpp b/lib/src/io.cpp index 706626c0a..b568dde98 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -156,6 +156,14 @@ bool record::isencrypted() const noexcept (true) { return this->attributes & DLIS_SEGATTR_ENCRYPT; } +bool index_entry::isexplicit() const noexcept (true) { + return this->attributes & DLIS_SEGATTR_EXFMTLR; +} + +bool index_entry::isencrypted() const noexcept (true) { + return this->attributes & DLIS_SEGATTR_ENCRYPT; +} + namespace { template < typename T > @@ -387,8 +395,8 @@ record& extract(stream& file, long long tell, long long bytes, record& rec) noex } } -stream_offsets findoffsets( dl::stream& file) noexcept (false) { - stream_offsets ofs; +dl::stream_offsets findoffsets(dl::stream& file) noexcept (false) { + dl::stream_offsets idx; std::int64_t offset = 0; char buffer[ DLIS_LRSH_SIZE ]; @@ -406,7 +414,7 @@ stream_offsets findoffsets( dl::stream& file) noexcept (false) { int isexplicit = attrs & DLIS_SEGATTR_EXFMTLR; if (not (attrs & DLIS_SEGATTR_PREDSEG)) { - if (isexplicit and type == 0 and ofs.explicits.size()) { + if (isexplicit and type == 0 and idx.explicits.size()) { /* * Wrap up when we encounter a EFLR of type FILE-HEADER that is * NOT the first Logical Record. More precisely we expect the @@ -417,18 +425,18 @@ stream_offsets findoffsets( dl::stream& file) noexcept (false) { file.seek( offset ); break; } - if (isexplicit) ofs.explicits.push_back( offset ); - /* - * Consider doing fdata-indexing on the fly as we are now at the - * correct offset to read the OBNAME. That would mean we only need - * to traverse the file a single time to index it. Additionally it - * would make the caller code from python way nicer. - */ - else ofs.implicits.push_back( offset ); + + index_entry entry; + entry.tell = offset; + entry.code = type; + entry.attributes = attrs; + + if (entry.isexplicit()) idx.explicits.push_back(entry); + else idx.implicits.push_back(entry); } offset += len; } - return ofs; + return idx; } std::vector< std::pair< std::string, long long > > diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index 3bd959f31..c41b91d9c 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -793,9 +793,9 @@ def rewind(offset, tif): explicits, implicits = core.findoffsets(stream) hint = rewind(stream.absolute_tell, tapemarks) - records = core.extract(stream, explicits) + records = core.extract(stream, [x.tell for x in explicits]) fdata_index = defaultdict(list) - for key, val in core.findfdata(stream, implicits): + for key, val in core.findfdata(stream, [x.tell for x in implicits]): fdata_index[key].append(val) lf = dlis(stream, records, fdata_index, sul) diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index c22c94fab..6d1ff37c3 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -884,6 +884,13 @@ PYBIND11_MODULE(core, m) { }) ; + py::class_< dl::index_entry >( m, "index_entry") + .def_property_readonly( "explicit", &dl::index_entry::isexplicit ) + .def_property_readonly( "encrypted", &dl::index_entry::isencrypted ) + .def_readonly( "code", &dl::index_entry::code ) + .def_readonly( "tell", &dl::index_entry::tell ) + ; + py::class_< dl::stream >( m, "stream" ) .def_property_readonly("absolute_tell", &dl::stream::absolute_tell) .def("seek", &dl::stream::seek) From f715cc737930cd02dc3b6aee2049f7aa7c54a092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Tue, 30 Jun 2020 14:11:53 +0200 Subject: [PATCH 03/27] Add lrsh consistency checks to the index_entry When index_entry contains attribute and code it makes sense that the sanity checking of these header fields are done when creating the index. Also these checks help validate the index, which is useful if we want to allow an incomplete index to propagate further. --- lib/extension/dlisio/ext/types.hpp | 14 ++++++++++ lib/src/io.cpp | 42 +++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index fb734afee..3bd496cc8 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -498,11 +498,25 @@ const char* parse_set_component( const char*, * Bodies. Rather, it contains the position of a Logical Record within a file, * the Logical Record Segment Header attributes - and code (logical record * type). + * + * consistency indicates if the header information code and attributes are + * consistent across all Logical Record Segment Headers. + * + * I.e. code should hold the same value in all lrsh. While all the bits in + * attributes should be the same across all lrsh, with exception of the + * predecessor and successor bits which should have the following structure: + * + * lrsh predecessor successor + * 0 0 1 + * 1 1 1 + * .. + * n 1 0 */ struct index_entry { long long tell; std::uint8_t code; // Logical Record Type - Appendix A std::uint8_t attributes; + bool consistent; bool isexplicit() const noexcept (true); bool isencrypted() const noexcept (true); diff --git a/lib/src/io.cpp b/lib/src/io.cpp index b568dde98..fd16e3001 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -397,44 +397,68 @@ record& extract(stream& file, long long tell, long long bytes, record& rec) noex dl::stream_offsets findoffsets(dl::stream& file) noexcept (false) { dl::stream_offsets idx; + shortvec< std::uint8_t > attributes; + shortvec< int > types; + bool consistent = true; - std::int64_t offset = 0; + std::int64_t offset_segment = 0; + std::int64_t offset_record = offset_segment; char buffer[ DLIS_LRSH_SIZE ]; int len = 0; while (true) { - file.seek(offset); + file.seek(offset_segment); file.read(buffer, DLIS_LRSH_SIZE); - if (file.eof()) + if (file.eof()) { + // TODO: check attributes and types -> should be empty break; + } int type; std::uint8_t attrs; dlis_lrsh( buffer, &len, &attrs, &type ); + attributes.push_back( attrs ); + types.push_back( type ); + int isexplicit = attrs & DLIS_SEGATTR_EXFMTLR; + + // Start of a new record if (not (attrs & DLIS_SEGATTR_PREDSEG)) { if (isexplicit and type == 0 and idx.explicits.size()) { /* * Wrap up when we encounter a EFLR of type FILE-HEADER that is - * NOT the first Logical Record. More precisely we expect the + * NOT the first Logical offset_record. More precisely we expect the * _first_ LR we encounter to be a FILE-HEADER. We gather up * this LR and all successive LR's until we encounter another * FILE-HEADER. */ - file.seek( offset ); + file.seek( offset_segment ); break; } + offset_record = offset_segment; + } + offset_segment += len; + + // We reached the last LRS in the current LR - check consistency and + // add to index + if (not (attrs & DLIS_SEGATTR_SUCCSEG)) { index_entry entry; - entry.tell = offset; - entry.code = type; - entry.attributes = attrs; + entry.tell = offset_record; + entry.code = types.front(); + entry.attributes = attributes.front(); + entry.consistent = consistent; + + if (not attr_consistent( attributes )) entry.consistent = false; + if (not type_consistent( types )) entry.consistent = false; if (entry.isexplicit()) idx.explicits.push_back(entry); else idx.implicits.push_back(entry); + + attributes.clear(); + types.clear(); } - offset += len; } return idx; } From ff05946bd2d9c5bbc3a50781828f25db1e08c142 Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Wed, 15 Jul 2020 14:44:42 +0200 Subject: [PATCH 04/27] Pass logical file directly as lookup parameter For lookup it's not actually interesting to know which object orders the lookup. We need barely the reference to logical file itself (which contains object pool). With the idea of moving tests to files and stopping their on-fly creation, situation where object will not have a logical file assigned is not supposed to happen anymore. --- python/dlisio/plumbing/basicobject.py | 2 +- python/dlisio/plumbing/channel.py | 2 +- python/dlisio/plumbing/linkage.py | 10 +++------- python/tests/test_basic_object.py | 11 +---------- 4 files changed, 6 insertions(+), 19 deletions(-) diff --git a/python/dlisio/plumbing/basicobject.py b/python/dlisio/plumbing/basicobject.py index c9d92738b..1fd4abc92 100644 --- a/python/dlisio/plumbing/basicobject.py +++ b/python/dlisio/plumbing/basicobject.py @@ -215,7 +215,7 @@ def __getitem__(self, key): if key in self.linkage and isreference(rp66value[0]): reftype = self.linkage[key] - value = [lookup(self, reftype, v) for v in rp66value] + value = [lookup(self.logicalfile, reftype, v) for v in rp66value] else: value = [v.strip() if isinstance(v, str) else v for v in rp66value] diff --git a/python/dlisio/plumbing/channel.py b/python/dlisio/plumbing/channel.py index e5d49e684..8a149dc7d 100644 --- a/python/dlisio/plumbing/channel.py +++ b/python/dlisio/plumbing/channel.py @@ -92,7 +92,7 @@ def __init__(self, obj = None, name = None, lf=None): @property def frame(self): if self._frame is not None: - return lookup(self, obname('FRAME'), self._frame) + return lookup(self.logicalfile, obname('FRAME'), self._frame) msg = '{} does not belong to any Frame' logging.info(msg.format(self)) diff --git a/python/dlisio/plumbing/linkage.py b/python/dlisio/plumbing/linkage.py index 535f51057..864ff346d 100644 --- a/python/dlisio/plumbing/linkage.py +++ b/python/dlisio/plumbing/linkage.py @@ -12,9 +12,9 @@ def objref(obj): if not isinstance(obj, core.objref): raise TypeError return obj.fingerprint, obj.type -def lookup(obj, reftype, value): +def lookup(lf, reftype, value): """Create a fingerprint from reftype(value) and look up corresponding - object in the logical file of obj.""" + object in the logical file.""" try: fp, objtype = reftype(value) except TypeError: @@ -23,15 +23,11 @@ def lookup(obj, reftype, value): return None try: - return obj.logicalfile[objtype][fp] + return lf[objtype][fp] except KeyError: msg = "Referenced object '{}' not in logical file" logging.warning(msg.format(fp)) return None - except TypeError: - msg = 'Unable to find referenced object, {} has no logical file' - logging.warning(msg.format(obj)) - return None def isreference(val): """Check if val is a rp66 reference typ""" diff --git a/python/tests/test_basic_object.py b/python/tests/test_basic_object.py index d6e4577aa..a337715a3 100644 --- a/python/tests/test_basic_object.py +++ b/python/tests/test_basic_object.py @@ -55,7 +55,7 @@ def test_lookup(): ch.logicalfile = lf value = dlisio.core.obname(10, 2, 'channel') - res = lookup(ch, linkage.obname('CHANNEL'), value) + res = lookup(lf, linkage.obname('CHANNEL'), value) assert res == other @@ -72,15 +72,6 @@ def test_lookup_value_should_be_objref(assert_log): assert res is None assert_log('Unable to create object-reference') -def test_lookup_no_logicalfile(assert_log): - value = dlisio.core.obname(10, 2, 'channel') - ch = Channel() #channel without reference to a logical file - - res = lookup(ch, linkage.obname('CHANNEL'), value) - - assert res is None - assert_log('has no logical file') - def test_lookup_no_such_object(assert_log): value = dlisio.core.obname(10, 2, 'channel') ch = Channel() From fc4f6a8cb61e7438be97e38b4189c3f94826a6bd Mon Sep 17 00:00:00 2001 From: Alena Chaikouskaya Date: Wed, 15 Jul 2020 14:39:52 +0200 Subject: [PATCH 05/27] Replace on-fly object creation in tests with files Creation of objects on fly isn't so easy anymore as we need to supply more and more values, and implementation keeps changing. Hence rewriting some tests to be based on actual files instead. This commit doesn't touch however on-fly tests which refered to singular objects and didn't need logical_file property set as for now they don't seem to be affected. --- python/data/chap4-7/README.rst | 6 + python/data/chap4-7/frame-channels.dlis | Bin 0 -> 660 bytes python/data/chap4-7/match.dlis | Bin 0 -> 440 bytes python/tests/test_basic_object.py | 33 +-- python/tests/test_curves.py | 330 +++++++----------------- python/tests/test_logical_file.py | 216 ++++++---------- 6 files changed, 190 insertions(+), 395 deletions(-) create mode 100644 python/data/chap4-7/frame-channels.dlis create mode 100644 python/data/chap4-7/match.dlis diff --git a/python/data/chap4-7/README.rst b/python/data/chap4-7/README.rst index 7dfda70c5..e719a19eb 100644 --- a/python/data/chap4-7/README.rst +++ b/python/data/chap4-7/README.rst @@ -140,6 +140,9 @@ Remaining files ================================================ ================================================== Filename Description ================================================ ================================================== +frame-channels.dlis Contains frames with channels with specific + repcodes and signatures for curves testing + invalid-date-in-origin.dlis Simple file which contains invalid creation time attribute in origin @@ -151,4 +154,7 @@ many-logical-files-error-in-last.dlis Contains several logical files, many-logical-files-same-object.dlis Contains 2 logical files with the same objects and encrypted records +match.dlis Various combinations of objects to test match + functions as they require specific name patterns + ================================================ ================================================== diff --git a/python/data/chap4-7/frame-channels.dlis b/python/data/chap4-7/frame-channels.dlis new file mode 100644 index 0000000000000000000000000000000000000000..62480e596f459f9a6eb526363c79621e4d849e44 GIT binary patch literal 660 zcma)2O-}+b5bbtZKUR%FO-wv>j(flWE(sp%cGM=&Vs|lmBE+mpfmq&Ze+(xqo{Zl5PbrJj30wD&>90dR zr)0KsDO*^%MlR5V`!U{hQAJo$-S!2VqmW8*H1>(3z+aAT0%z=f^pdjS=xOlKAF{{6 zFkN1+MaZ}H|C+k9&FGpdg{|<=V8>vLD&dLLsuF2|`;BcO+o)nI95`b_N{KCjLe2$t zU=QfnDfGQu7u`+^oSZ*Azt&&}yzH&zb#MbpX;GcQNkB`BDunX#O#^m=5_R0n$5Yxa z0cB#jue6d%;2jF^=G_Mu*wX5%m*}3-y-UOGejDJDADi!cTAF!tDcmYz#>Z8EKA@7^ zchFAUZgZ*TJ%L-~%fLokr1Awu26-&imF+()>@&B;OaiMt|F>M(IZKfJj*NJMBB0hO cD+{oEo*F;p*t(8f3yB>>3#A>gVjL>lfg9H`+qcQ9+?N zKQ~n&CqF$iIWb2eEi)$-E=?4J;RVoss|Mx|?9LvJetxb#Cc;6k0YR?8u6`koA)fwz zy3YPCuKcE)E}p)we!)NiA;yAw1~y;E5N8iV1~(FQ3cz$02rzI!%rG=$5Nlxh!0Hy{ x=<5m;W(Nx!839FGKCps0$f7bpQ4SLm1HF*Q0FZJnAX^ve8kn3elAIny4ggLLX5;_> literal 0 HcmV?d00001 diff --git a/python/tests/test_basic_object.py b/python/tests/test_basic_object.py index a337715a3..8c41b828d 100644 --- a/python/tests/test_basic_object.py +++ b/python/tests/test_basic_object.py @@ -42,41 +42,28 @@ def test_getitem_noattribute(f): with pytest.raises(KeyError): _ = obj['DUMMY'] -def test_lookup(): - other = Channel() - other.name = 'channel' - other.origin = 10 - other.copynumber = 2 +def test_lookup(f): + value = dlisio.core.obname(10, 0, 'CHANN2') + res = lookup(f, linkage.obname('CHANNEL'), value) - lf = dlisio.dlis(None, [], [], []) - lf.indexedobjects['CHANNEL'] = {other.fingerprint : other} + assert res.long_name == "CHANN2-LONG-NAME" - ch = Channel() - ch.logicalfile = lf - - value = dlisio.core.obname(10, 2, 'channel') - res = lookup(lf, linkage.obname('CHANNEL'), value) - - assert res == other - -def test_lookup_value_not_a_ref(assert_log): - res = lookup(None, linkage.objref, 0) +def test_lookup_value_not_a_ref(f, assert_log): + res = lookup(f, linkage.objref, 0) assert res is None assert_log('Unable to create object-reference') -def test_lookup_value_should_be_objref(assert_log): +def test_lookup_value_should_be_objref(f, assert_log): value = dlisio.core.obname(10, 2, 'channel') - res = lookup(None, linkage.objref, value) + res = lookup(f, linkage.objref, value) assert res is None assert_log('Unable to create object-reference') -def test_lookup_no_such_object(assert_log): +def test_lookup_no_such_object(f, assert_log): value = dlisio.core.obname(10, 2, 'channel') - ch = Channel() - ch.logicalfile = dlisio.dlis(None, [], [], []) - res = lookup(ch, linkage.obname('CHANNEL'), value) + res = lookup(f, linkage.obname('CHANNEL'), value) assert res is None assert_log('not in logical file') diff --git a/python/tests/test_curves.py b/python/tests/test_curves.py index c6ad25291..72e6002e2 100644 --- a/python/tests/test_curves.py +++ b/python/tests/test_curves.py @@ -16,63 +16,6 @@ def load_curves(fpath): curves = frame.curves() return curves -def makeframe(): - frame = dlisio.plumbing.Frame() - frame.name = 'MAINFRAME' - frame.origin = 0 - frame.copynumber = 0 - - time0 = dlisio.plumbing.Channel() - time0.name = 'TIME' - time0.origin = 0 - time0.copynumber = 0 - attic = { - 'DIMENSION': [1], - 'REPRESENTATION-CODE' : [2] # f4 - } - time0.attic = attic - - tdep = dlisio.plumbing.Channel() - tdep.name = 'TDEP' - tdep.origin = 0 - tdep.copynumber = 0 - attic = { - 'DIMENSION': [2], - 'REPRESENTATION-CODE' : [13] # i2 - } - tdep.attic = attic - - time1 = dlisio.plumbing.Channel() - time1.name = 'TIME' - time1.origin = 1 - time1.copynumber = 0 - attic = { - 'DIMENSION' : [1], - 'REPRESENTATION-CODE' : [13], # i2 - } - time1.attic = attic - - #frame.channels = [time0, tdep, time1] - frame.attic = { - 'CHANNELS' : [core.obname(time0.origin, time0.copynumber, time0.name), - core.obname(tdep.origin, tdep.copynumber, tdep.name), - core.obname(time1.origin, time1.copynumber, time1.name)] - } - - logicalfile = dlisio.dlis(None, [], [], []) - logicalfile.indexedobjects['FRAME'] = { frame.fingerprint : frame } - logicalfile.indexedobjects['CHANNEL'] = { - time0.fingerprint : time0, - tdep.fingerprint : tdep, - time1.fingerprint : time1, - } - for objs in logicalfile.indexedobjects.values(): - for obj in objs.values(): - obj.logicalfile = logicalfile - - frame.link() - return frame - def test_curves_are_copy(f): # All channel.curves() really does is to slice the full frame array # returned by frame.curves(). Make sure the returned slice is a copy not a @@ -219,20 +162,21 @@ def test_dimensions_in_multifdata(): np.testing.assert_array_equal(curves[1][1], [[7, 8, 9], [10, 11, 12]]) def test_duplicated_mnemonics_get_unique_labels(): - frame = makeframe() - assert 'ifDDD' == frame.fmtstr() - dtype = frame.dtype() + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + assert 'ifDDD' == frame.fmtstr() + dtype = frame.dtype() - assert ('FRAMENO', 'TIME.0.0', 'TDEP', 'TIME.1.0') == dtype.names + assert ('FRAMENO', 'TIME.0.0', 'TDEP', 'TIME.1.0') == dtype.names - fields = [ - 'FRAMENO', - frame.channels[0].fingerprint, - frame.channels[1].fingerprint, - frame.channels[2].fingerprint, - ] + fields = [ + 'FRAMENO', + frame.channels[0].fingerprint, + frame.channels[1].fingerprint, + frame.channels[2].fingerprint, + ] - assert all(x in dtype.fields for x in fields) + assert all(x in dtype.fields for x in fields) def test_duplicated_mnemonics_dtype_supports_buffer_protocol(): @@ -249,24 +193,21 @@ def test_duplicated_mnemonics_dtype_supports_buffer_protocol(): # the IDENT type, but dlisio imposes no such restriction) # # https://github.com/equinor/dlisio/pull/97 - frame = makeframe() - _ = memoryview(np.zeros(1, dtype = frame.dtype())) - -def test_duplicated_signatures(f, assert_log): - frame = f.object('FRAME', 'FRAME1') - - frame.channels[1].name = frame.channels[0].name - frame.channels[1].origin = frame.channels[0].origin - frame.channels[1].copynumber = frame.channels[0].copynumber + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + _ = memoryview(np.zeros(1, dtype = frame.dtype())) - with pytest.raises(Exception): - _ = frame.curves() - assert_log("duplicated mnemonics") +def test_duplicated_signatures(assert_log): + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "DUPLICATED") + with pytest.raises(Exception): + _ = frame.curves() + assert_log("duplicated mnemonics") - curves = frame.curves(strict=False) - names = curves.dtype.names + curves = frame.curves(strict=False) + names = curves.dtype.names - assert names == ('FRAMENO', 'CHANN1.10.0(0)', 'CHANN1.10.0(1)') + assert names == ('FRAMENO', 'DUPL.0.0(0)', 'DUPL.0.0(1)') def test_mkunique(): types = [ @@ -289,35 +230,38 @@ def test_mkunique(): def test_channel_order(): - frame = makeframe() + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") - ref = [("TIME", 0), ("TDEP", 0), ("TIME", 1)] + ref = [("TIME", 0), ("TDEP", 0), ("TIME", 1)] - for i, ch in enumerate(frame.channels): - assert ch.name == ref[i][0] - assert ch.origin == ref[i][1] + for i, ch in enumerate(frame.channels): + assert ch.name == ref[i][0] + assert ch.origin == ref[i][1] def test_dtype(): - frame = makeframe() + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") - dtype = np.dtype([ - ('FRAMENO', np.int32), - ((frame.channels[0].fingerprint, 'TIME.0.0'), np.float32), - ((frame.channels[1].fingerprint, 'TDEP'), np.int16, (2,)), - ((frame.channels[2].fingerprint, 'TIME.1.0'), np.int16), - ]) + dtype = np.dtype([ + ('FRAMENO', np.int32), + ((frame.channels[0].fingerprint, 'TIME.0.0'), np.float32), + ((frame.channels[1].fingerprint, 'TDEP'), np.int16, (2,)), + ((frame.channels[2].fingerprint, 'TIME.1.0'), np.int16), + ]) - assert frame.dtype() == dtype + assert frame.dtype() == dtype def test_dtype_fmt_instance(): - frame = makeframe() - frame.dtype_fmt = 'x-{:s} {:d}~{:d}' + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + frame.dtype_fmt = 'x-{:s} {:d}~{:d}' - # fmtstr is unchanged - assert 'ifDDD' == frame.fmtstr() - expected_names = ('FRAMENO', 'x-TIME 0~0', 'TDEP', 'x-TIME 1~0') - assert expected_names == frame.dtype().names + # fmtstr is unchanged + assert 'ifDDD' == frame.fmtstr() + expected_names = ('FRAMENO', 'x-TIME 0~0', 'TDEP', 'x-TIME 1~0') + assert expected_names == frame.dtype().names def test_dtype_fmt_class(): original = dlisio.plumbing.Frame.dtype_format @@ -325,10 +269,11 @@ def test_dtype_fmt_class(): try: # change dtype before the object itself is constructed, so it dlisio.plumbing.Frame.dtype_format = 'x-{:s} {:d}~{:d}' - frame = makeframe() - expected_names = ('FRAMENO', 'x-TIME 0~0', 'TDEP', 'x-TIME 1~0') - assert expected_names == frame.dtype().names - assert 'ifDDD' == frame.fmtstr() + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + expected_names = ('FRAMENO', 'x-TIME 0~0', 'TDEP', 'x-TIME 1~0') + assert expected_names == frame.dtype().names + assert 'ifDDD' == frame.fmtstr() finally: # even if the test fails, make sure the format string is reset to its @@ -340,12 +285,12 @@ def test_dtype_fmt_class(): ("x-{:s}.{:d}.{:d}.{:d}"), ]) def test_dtype_wrong_fmt(fmt, assert_log): - frame = makeframe() - - frame.dtype_fmt = fmt - with pytest.raises(Exception): - _ = frame.dtype().names - assert_log("rich label") + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + frame.dtype_fmt = fmt + with pytest.raises(Exception): + _ = frame.dtype().names + assert_log("rich label") def test_channel_curves(): @@ -362,138 +307,55 @@ def test_channel_curves(): frame_curves = load_curves(fpath) assert frame_curves['CH22'] == curves22 -def test_channel_curves_duplicated_mnemonics(f): - frame = f.object('FRAME', 'FRAME1') - frame.channels[1].name = frame.channels[0].name - frame.channels[1].copynumber = frame.channels[0].copynumber+ 1 - - ch = frame.channels[0] - curve = ch.curves() +def test_channel_curves_duplicated_mnemonics(): + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + channel = f.object("CHANNEL", "TIME", 0, 0) + curve = channel.curves() - np.testing.assert_array_equal(curve, frame.curves()[ch.fingerprint]) + np.testing.assert_array_equal(curve, + frame.curves()[channel.fingerprint]) def test_channel_without_frame(assert_info): - channel = dlisio.plumbing.Channel() - assert channel.curves() == None - assert_info('no recorded curve-data') + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + channel = f.object("CHANNEL", "BELONG_TO_NO_FRAME") + assert channel.curves() == None + assert_info('no recorded curve-data') - assert channel.frame == None - assert_info('does not belong') + assert channel.frame == None + assert_info('does not belong') def test_channel_fmt(): - ch1 = dlisio.plumbing.Channel() - ch1.name = 'ch1' - ch1.origin = 0 - ch1.copynumber = 0 - ch1.attic = { - 'DIMENSION': [5], - 'REPRESENTATION-CODE': [11], - } - - ch2 = dlisio.plumbing.Channel() - ch2.name = 'ch2' - ch2.origin = 0 - ch2.copynumber = 0 - ch2.attic = { - 'DIMENSION': [2, 2], - 'REPRESENTATION-CODE': [3], - } - - ch3 = dlisio.plumbing.Channel() - ch3.name = 'ch3' - ch3.origin = 0 - ch3.copynumber = 0 - ch3.attic = { - 'DIMENSION': [4, 2], - 'REPRESENTATION-CODE': [26], - } - - ch4 = dlisio.plumbing.Channel() - ch4.name = 'ch4' - ch4.origin = 0 - ch4.copynumber = 0 - ch4.attic = { - 'DIMENSION': [1], - 'REPRESENTATION-CODE': [17], - } - - ch5 = dlisio.plumbing.Channel() - ch5.name = 'ch5' - ch5.origin = 0 - ch5.copynumber = 0 - ch5.attic = { - 'DIMENSION': [2, 3, 1], - 'REPRESENTATION-CODE': [12], - } - - frame = dlisio.plumbing.Frame() - frame.name = 'fr' - frame.origin = 0 - frame.copynumber = 0 - frame.attic = { - 'CHANNELS': [ - core.obname(ch1.origin, ch1.copynumber, ch1.name), - core.obname(ch2.origin, ch2.copynumber, ch2.name), - core.obname(ch3.origin, ch3.copynumber, ch3.name), - core.obname(ch4.origin, ch4.copynumber, ch4.name), - core.obname(ch5.origin, ch5.copynumber, ch5.name), - ] - } - - logicalfile = dlisio.dlis(None, [], [], []) - logicalfile.indexedobjects['FRAME'] = { - frame.fingerprint: frame - } - - logicalfile.indexedobjects['CHANNEL'] = { - ch1.fingerprint: ch1, - ch2.fingerprint: ch2, - ch3.fingerprint: ch3, - ch4.fingerprint: ch4, - ch5.fingerprint: ch5, - } - frame.logicalfile = logicalfile - ch1.logicalfile = logicalfile - ch2.logicalfile = logicalfile - ch3.logicalfile = logicalfile - ch4.logicalfile = logicalfile - ch5.logicalfile = logicalfile - - pre_fmt, ch_fmt, post_fmt = frame.fmtstrchannel(ch3) - assert pre_fmt == "CCCCCbbbb" - assert ch_fmt == "qqqqqqqq" - assert post_fmt == "Ldddddd" - + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "VARIOUS") + channel = f.object("CHANNEL", "chn3") + pre_fmt, ch_fmt, post_fmt = frame.fmtstrchannel(channel) + assert pre_fmt == "CCCCCbbbb" + assert ch_fmt == "qqqqqqqq" + assert post_fmt == "Ldddddd" def test_channel_no_dimension(assert_log): - ch = dlisio.plumbing.Channel() - ch.name = 'CH' - ch.origin = 0 - ch.copynumber = 0 - ch.attic = {'REPRESENTATION-CODE': [17]} - - with pytest.raises(ValueError) as exc: - ch.fmtstr() - assert "channel.dimension is invalid" in str(exc.value) - - ch.attic['DIMENSION'] = [1] - assert ch.fmtstr() == "L" + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + ch = f.object("CHANNEL", "NODIM") + with pytest.raises(ValueError) as exc: + ch.fmtstr() + assert "channel.dimension is invalid" in str(exc.value) + ch.attic['DIMENSION'] = [1] + assert ch.fmtstr() == "L" def test_frame_index(): - frame = makeframe() - frame.attic['INDEX-TYPE'] = ['DECREASING'] - - assert frame.index == frame.channels[0].name + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "MAINFRAME") + assert frame.index == frame.channels[0].name def test_frame_index_absent(assert_info): - frame = makeframe() - assert frame.index == 'FRAMENO' - -def test_frame_index_absent_nochannels(assert_info): - frame = dlisio.plumbing.Frame() - frame.attic['INDEX-TYPE'] = ['DECREASING'] - - assert frame.index is None - assert_info('Frame has no channels') - + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "NONINDEXED") + assert frame.index == 'FRAMENO' + +def test_frame_index_nochannels(assert_info): + with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): + frame = f.object("FRAME", "INDEXED_NO_CHANNELS") + assert frame.index is None + assert_info('Frame has no channels') diff --git a/python/tests/test_logical_file.py b/python/tests/test_logical_file.py index 323660148..5479ba351 100644 --- a/python/tests/test_logical_file.py +++ b/python/tests/test_logical_file.py @@ -11,173 +11,113 @@ from dlisio import core -@pytest.fixture(scope="module") -def g(): - s = dlisio.open("tests/test_logical_file.py") #any existing file is required - g = dlisio.dlis(s, [], [], []) - - ch = Channel() - ch.name = 'CHANNEL1' - ch.origin = 0 - ch.copynumber = 0 - ch.logicalfile = g - g.indexedobjects["CHANNEL"][ch.fingerprint] = ch - - ch = Channel() - ch.name = 'CHANNEL1.V2' - ch.origin = 0 - ch.copynumber = 0 - ch.logicalfile = g - g.indexedobjects["CHANNEL"][ch.fingerprint] = ch - - ch = Channel() - ch.name = 'CHANNEL1' - ch.origin = 0 - ch.copynumber = 1 - ch.logicalfile = g - g.indexedobjects["CHANNEL"][ch.fingerprint] = ch - - un = Unknown() - un.name = 'UNEFRAME' - un.origin = 0 - un.copynumber = 0 - un.type = "NONCHANNEL" - un.logicalfile = g - g.indexedobjects["NONCHANNEL"][un.fingerprint] = un - - fr = Frame() - fr.name = 'UNEFRAME' - fr.origin = 0 - fr.copynumber = 0 - fr.logicalfile = g - g.indexedobjects["FRAME"][fr.fingerprint] = fr - - un = Unknown() - un.name = '440-CHANNEL' - un.origin = 0 - un.copynumber = 0 - un.type = "440.TYPE" - un.logicalfile = g - g.indexedobjects["440.TYPE"][un.fingerprint] = un - - ch = Channel() - ch.name = '440.CHANNEL' - ch.origin = 0 - ch.copynumber = 0 - ch.type = "440-TYPE" - ch.logicalfile = g - g.indexedobjects["440-TYPE"][ch.fingerprint] = ch - - g.record_types = list(g.indexedobjects.keys()) - - # Simulate the occurance of multiple Channel sets - g.record_types.append('CHANNEL') - return g - -def test_object(g): - channel = g.object("CHANNEL", "CHANNEL1", 0, 1) - assert channel.name == "CHANNEL1" - assert channel.origin == 0 - assert channel.copynumber == 1 +def test_object(f): + channel = f.object("CHANNEL", "CHANN1", 10, 0) + assert channel.name == "CHANN1" + assert channel.origin == 10 + assert channel.copynumber == 0 assert channel.type == "CHANNEL" -def test_object_unknown(g): - channel = g.object("NONCHANNEL", "UNEFRAME", 0, 0) - assert channel.name == "UNEFRAME" - assert channel.origin == 0 +def test_object_unknown(f): + channel = f.object("UNKNOWN_SET", "OBJ1", 10, 0) + assert channel.name == "OBJ1" + assert channel.origin == 10 assert channel.copynumber == 0 - assert channel.type == "NONCHANNEL" + assert channel.type == "UNKNOWN_SET" -def test_object_nonexisting(g): +def test_object_nonexisting(f): with pytest.raises(ValueError) as exc: - _ = g.object("UNKNOWN_TYPE", "SOME_OBJECT", 0, 0) + _ = f.object("UNKNOWN_TYPE", "SOME_OBJECT", 0, 0) assert "not found" in str(exc.value) with pytest.raises(ValueError): - _ = g.object("CHANNEL", "CHANNEL1", 11, 0) + _ = f.object("CHANNEL", "CHANN1", 11, 0) with pytest.raises(TypeError): - _ = g.object("WEIRD", "CHANNEL1", "-1", "-1") + _ = f.object("WEIRD", "CHANN1", "-1", "-1") -def test_object_solo_nameonly(g): - channel = g.object("CHANNEL", "CHANNEL1.V2") - assert channel.name == "CHANNEL1.V2" - assert channel.origin == 0 +def test_object_solo_nameonly(f): + channel = f.object("CHANNEL", "CHANN2") + assert channel.name == "CHANN2" + assert channel.origin == 10 assert channel.copynumber == 0 - assert channel.type == "CHANNEL" + assert channel.type == "CHANNEL" -def test_object_nonexisting_nameonly(g): +def test_object_nonexisting_nameonly(f): with pytest.raises(ValueError) as exc: - _ = g.object("CHANNEL", "NOTFOUND") + _ = f.object("CHANNEL", "NOTFOUND") assert "No objects" in str(exc.value) -def test_object_many_objects_nameonly(g): - with pytest.raises(ValueError) as exc: - _ = g.object("CHANNEL", "CHANNEL1") - assert "There are multiple" in str(exc.value) - -def test_match(g): - refs = [] - refs.append( g.object('CHANNEL', 'CHANNEL1', 0, 0) ) - refs.append( g.object('CHANNEL', 'CHANNEL1.V2', 0, 0) ) - refs.append( g.object('CHANNEL', 'CHANNEL1', 0, 1) ) +def test_object_many_objects_nameonly(tmpdir_factory, merge_files_manyLR): + with dlisio.load("data/chap4-7/match.dlis") as (f, *_): + with pytest.raises(ValueError) as exc: + _ = f.object("CHANNEL", "MATCH1") + assert "There are multiple" in str(exc.value) - channels = g.match('.*chan.*') +def test_match(): + with dlisio.load("data/chap4-7/match.dlis") as (f, *_): + refs = [] + refs.append(f.object('CHANNEL', 'MATCH1', 16, 0)) + refs.append(f.object('CHANNEL', 'MATCH111', 16, 0)) + refs.append(f.object('CHANNEL', 'MATCH1', 127, 0)) - assert len(list(channels)) == 3 - for ch in channels: - assert ch in refs + channels = f.match('.*match1.*') - channels = g.match('.*chan.*', ".*") - assert len(list(channels)) == 5 + assert len(list(channels)) == 3 + for ch in channels: + assert ch in refs -def test_match_type(g): - refs = [] + channels = f.match('.*match1.*', ".*") + assert len(list(channels)) == 5 - refs.append( g.object('NONCHANNEL', 'UNEFRAME', 0, 0) ) - refs.append( g.object('FRAME', 'UNEFRAME', 0, 0) ) +def test_match_type(): + with dlisio.load("data/chap4-7/match.dlis") as (f, *_): + refs = [] + refs.append( f.object('MATCH', 'MATCH22', 16, 0) ) + refs.append( f.object('FRAME', 'MATCH22', 16, 0) ) - objs = g.match('UNEFR.*', type='NONCHANNEL|FRAME') + objs = f.match('MATCH2.*', type='MATCH|FRAME') - assert len(list(objs)) == len(refs) - for obj in objs: - assert obj in refs + assert len(list(objs)) == len(refs) + for obj in objs: + assert obj in refs - objs = g.match('', type='NONCHANNEL|frame') + objs = f.match('', type='MATCH|frame') - assert len(list(objs)) == len(refs) - for obj in objs: - assert obj in refs + assert len(list(objs)) == len(refs) + for obj in objs: + assert obj in refs -def test_match_invalid_regex(g): +def test_match_invalid_regex(f): with pytest.raises(ValueError): - _ = next(g.match('*')) + _ = next(f.match('*')) with pytest.raises(ValueError): - _ = next(g.match('AIBK', type='*')) - -def test_match_special_characters(g): - o1 = g.object('440.TYPE', '440-CHANNEL', 0, 0) - o2 = g.object('440-TYPE', '440.CHANNEL', 0, 0) - - refs = [o1, o2] - channels = g.match('440.CHANNEL', '440.TYPE') - - assert len(list(channels)) == 2 - for ch in channels: - assert ch in refs - - refs = [o1] - channels = g.match('440-CHANNEL', '440.TYPE') - assert len(list(channels)) == 1 - for ch in channels: - assert ch in refs - - refs = [o2] - channels = g.match('440.CHANNEL', '440-TYPE') - assert len(list(channels)) == 1 - for ch in channels: - assert ch in refs + _ = next(f.match('AIBK', type='*')) + +def test_match_special_characters(): + with dlisio.load("data/chap4-7/match.dlis") as (f, *_): + o1 = f.object('440.TYPE', '440-MATCH1', 16, 0) + o2 = f.object('440-TYPE', '440.MATCH1', 16, 0) + + refs = [o1, o2] + channels = f.match('440.MATCH1', '440.TYPE') + + assert len(list(channels)) == 2 + for ch in channels: + assert ch in refs + + refs = [o1] + channels = f.match('440-MATCH1', '440.TYPE') + assert len(list(channels)) == 1 + for ch in channels: + assert ch in refs + + refs = [o2] + channels = f.match('440.MATCH1', '440-TYPE') + assert len(list(channels)) == 1 + for ch in channels: + assert ch in refs def test_indexedobjects(f): assert f.fileheader.name == "N" From 47f73d6b3cf31b663f8d49bd49144c58aaae4793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Thu, 9 Jul 2020 15:29:54 +0200 Subject: [PATCH 06/27] Use dl:basic_object as attic for Python types This is a small step in a larger quest of moving the pool of metadata objects to c++ which will give a performance boost. There is a nice side-effect to this change: Because attic is now a c++ type, the values of object-attributes are not casted to python types until the attribute is access. Thus UnicodeErrors that were previously emitted on object construction is now postpone until the attribute is accessed. --- python/dlisio/__init__.py | 6 ++-- python/dlisio/ext/core.cpp | 41 +++++++++++++++++---------- python/dlisio/plumbing/basicobject.py | 8 +++--- python/dlisio/plumbing/frame.py | 7 +++-- python/dlisio/plumbing/wellref.py | 12 ++++++-- python/tests/test_basic_object.py | 18 ++++-------- python/tests/test_curves.py | 3 -- python/tests/test_encodings.py | 3 +- 8 files changed, 56 insertions(+), 42 deletions(-) diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index c41b91d9c..a5b03d9ab 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -622,13 +622,13 @@ def load(self, records=None, reload=True): for os in sets: # TODO: handle replacement sets - for name, o in os.objects.items(): + for o in os.objects: try: - obj = self.types[os.type](o, name = name, lf = self) + obj = self.types[os.type](o, name = o.name, lf = self) except KeyError: obj = plumbing.Unknown( o, - name = name, + name = o.name, type = os.type, lf = self ) diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 6d1ff37c3..d618cc10f 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -816,24 +816,35 @@ PYBIND11_MODULE(core, m) { }) ; + py::class_< dl::basic_object >( m, "basic_object" ) + .def_readonly("type", &dl::basic_object::type) + .def_readonly("name", &dl::basic_object::object_name) + .def( "__len__", []( const dl::basic_object& o ) { + return o.attributes.size(); + }) + .def( "__eq__", &dl::basic_object::operator == ) + .def( "__ne__", &dl::basic_object::operator != ) + .def( "__getitem__", []( dl::basic_object& o, const std::string& key ) { + try { return o.at(key).value; } + catch (const std::out_of_range& e) { throw py::key_error( e.what() ); } + }) + .def( "__repr__", []( const dl::basic_object& o ) { + return "dlisio.core.basic_object(name={})"_s + .format(o.object_name); + }) + .def( "keys", []( const dl::basic_object& o ){ + std::vector< dl::ident > keys; + for ( auto attr : o.attributes ) { + keys.push_back( attr.label ); + } + return keys; + }) + ; + py::class_< dl::object_set >( m, "object_set" ) .def_readonly( "type", &dl::object_set::type ) .def_readonly( "name", &dl::object_set::name ) - .def_property_readonly("objects", - [](const dl::object_set& object_set) { - py::dict objects; - for (const auto& object : object_set.objects) { - py::dict obj; - for (const auto& attr : object.attributes) { - auto label = py::str(dl::decay(attr.label)); - // TODO: need units? So far they're not used - obj[label] = dl::decay(attr.value); - } - const auto& name = dl::decay(object.object_name); - objects[py::cast(name)] = obj; - } - return objects; - }) + .def_readonly( "objects", &dl::object_set::objects ) ; py::enum_< dl::representation_code >( m, "reprc" ) diff --git a/python/dlisio/plumbing/basicobject.py b/python/dlisio/plumbing/basicobject.py index 1fd4abc92..b188484b3 100644 --- a/python/dlisio/plumbing/basicobject.py +++ b/python/dlisio/plumbing/basicobject.py @@ -196,7 +196,7 @@ def __getitem__(self, key): Returns a default value for missing attributes. I.e. attributes defined in :attr:`attributes` but are not in :attr:`attic`. """ - if key not in self.attributes and key not in self.attic: + if key not in self.attributes and key not in self.attic.keys(): raise KeyError("'{}'".format(key)) try: @@ -253,9 +253,9 @@ def stash(self): all attributes not defined in :attr:`attributes` """ stash = { - key : value - for key, value - in self.attic.items() + key : self.attic[key] + for key + in self.attic.keys() if key not in self.attributes } diff --git a/python/dlisio/plumbing/frame.py b/python/dlisio/plumbing/frame.py index fa69d904b..93520bacb 100644 --- a/python/dlisio/plumbing/frame.py +++ b/python/dlisio/plumbing/frame.py @@ -123,8 +123,11 @@ def spacing(self): @property def encrypted(self): - if 'ENCRYPTED' in self.attic: return True - else: return False + try: + _ = self.attic['ENCRYPTED'] + return True + except KeyError: + return False @property def index_min(self): diff --git a/python/dlisio/plumbing/wellref.py b/python/dlisio/plumbing/wellref.py index b30ba4a51..2b3334266 100644 --- a/python/dlisio/plumbing/wellref.py +++ b/python/dlisio/plumbing/wellref.py @@ -94,8 +94,16 @@ def coordinates(self): value = 'COORDINATE-{}-VALUE' for i in range(1, 4): - key = self.attic.get(name.format(i), [custom_label.format(i)])[0] - val = self.attic.get(value.format(i), [None])[0] + try: + key = self.attic[name.format(i)][0] + except KeyError: + key = custom_label.format(i) + + try: + val = self.attic[value.format(i)][0] + except KeyError: + val = None + coordinates[key] = val return coordinates diff --git a/python/tests/test_basic_object.py b/python/tests/test_basic_object.py index 8c41b828d..1d5cb22e1 100644 --- a/python/tests/test_basic_object.py +++ b/python/tests/test_basic_object.py @@ -19,19 +19,13 @@ def test_getitem_defaultvalue(f): assert obj['INDEX-TYPE'] is None def test_getitem_unexpected_attr(f): - obj = f.object('FRAME', 'FRAME2') - - try: - obj.attic['NEW-ATTR'] = [1] - - # Attributes unknown to dlisio, such as 'NEW-ATTR' should be reachable - # through __getitem__ - assert obj['NEW-ATTR'] == [1] + obj = f.object('CALIBRATION-COEFFICIENT', 'COEFF_BAD') + # Attributes unknown to dlisio, such as 'NEW-ATTR' should be reachable + # through __getitem__ + assert obj['LNKS'] == [18, 32] - # Should also be in stash - assert obj.stash['NEW-ATTR'] == [1] - finally: - del obj.attic['NEW-ATTR'] + # Should also be in stash + assert obj.stash['LNKS'] == [18, 32] def test_getitem_noattribute(f): obj = f.object('FRAME', 'FRAME2') diff --git a/python/tests/test_curves.py b/python/tests/test_curves.py index 72e6002e2..c88600627 100644 --- a/python/tests/test_curves.py +++ b/python/tests/test_curves.py @@ -341,9 +341,6 @@ def test_channel_no_dimension(assert_log): ch.fmtstr() assert "channel.dimension is invalid" in str(exc.value) - ch.attic['DIMENSION'] = [1] - assert ch.fmtstr() == "L" - def test_frame_index(): with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): frame = f.object("FRAME", "MAINFRAME") diff --git a/python/tests/test_encodings.py b/python/tests/test_encodings.py index 35e060a97..14bf7137b 100644 --- a/python/tests/test_encodings.py +++ b/python/tests/test_encodings.py @@ -46,7 +46,8 @@ def test_broken_utf8_value(tmpdir, merge_files_oneLR): dlisio.set_encodings([]) with pytest.warns(UnicodeWarning): with dlisio.load(path) as (f, *_): - f.load() + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT') + _ = obj['DEFAULT_ATTRIBUTE'] try: dlisio.set_encodings(['koi8_r']) with dlisio.load(path) as (f, *_): From 4153edfa116a3a951beb48b627e37bfa2493bcac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Thu, 9 Jul 2020 17:58:56 +0200 Subject: [PATCH 07/27] Move the channel -> frame referencing out of load Creating a back-reference from channels to their parent frame is rather awkward. Ideally we would completely remove it, but that would require some notable changes to the api. So it stays for now. The c++ api doesn't currently take part in these back-references between objects - and we want it to stay that way. The loading routine will gradually be moved to c++, hence the referencing is moved so that the porting doesn't require c++ to do back-referencing. --- python/dlisio/__init__.py | 7 ------- python/dlisio/plumbing/channel.py | 26 +++++++++++++++++++------- python/dlisio/plumbing/frame.py | 16 ---------------- python/dlisio/plumbing/linkage.py | 10 ++++++++++ python/tests/test_curves.py | 3 +++ 5 files changed, 32 insertions(+), 30 deletions(-) diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index a5b03d9ab..c9cc03962 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -651,13 +651,6 @@ def load(self, records=None, reload=True): self.indexedobjects = indexedobjects self.problematic = problematic - if 'FRAME' not in [x.type for x in sets]: return self - - # Frame objects need the additional step of updating its Channels with - # a reference back to itself. See Frame.link() - for obj in self.indexedobjects['FRAME'].values(): - obj.link() - return self def storage_label(self): diff --git a/python/dlisio/plumbing/channel.py b/python/dlisio/plumbing/channel.py index 8a149dc7d..72e8cb0a5 100644 --- a/python/dlisio/plumbing/channel.py +++ b/python/dlisio/plumbing/channel.py @@ -86,16 +86,28 @@ class Channel(BasicObject): def __init__(self, obj = None, name = None, lf=None): super().__init__(obj, name = name, type = 'CHANNEL', lf=lf) - # The numpy data type of the sample array - self._frame = None @property def frame(self): - if self._frame is not None: - return lookup(self.logicalfile, obname('FRAME'), self._frame) + if self.logicalfile is None: + msg = 'Unable to lookup frame, {} has no logical file' + logging.info(msg.format(self)) + return None + + # Find the frame(s) that are claiming ownership over this channel + frames = findframe(self.fingerprint, self.logicalfile) + + if len(frames) == 1: + return lookup(self.logicalfile, obname('FRAME'), frames[0]) + + if len(frames) == 0: + msg = '{} does not belong to any Frame' + logging.info(msg.format(self)) + + if len(frames) > 1: + msg = '{} belong to multiple frames. Candidates are {}' + logging.info(msg.format(self, frames)) - msg = '{} does not belong to any Frame' - logging.info(msg.format(self)) return None @property @@ -221,7 +233,7 @@ def curves(self): >>> curve[0][1][2] 6 """ - if self._frame is not None: + if self.frame is not None: return np.copy(self.frame.curves()[self.fingerprint]) msg = 'There is no recorded curve-data for {}' diff --git a/python/dlisio/plumbing/frame.py b/python/dlisio/plumbing/frame.py index 93520bacb..2a9288044 100644 --- a/python/dlisio/plumbing/frame.py +++ b/python/dlisio/plumbing/frame.py @@ -483,22 +483,6 @@ def fmtstrchannel(self, channel): return pre_fmt, ch_fmt, post_fmt - def link(self): - # Reference from a Channel to the Frame it belongs to is not explicitly - # present in file. However it is very convenient that Channels are - # aware of their parent frame. Without a this reference present in the - # file, its the Frame's responsibility to update all it's Channel with - # a reference back to itself. - for ch in self.channels: - try: - if ch._frame: - msg = '{} already belongs to {}, ownership given to {}' - logging.warning(msg.format(ch, ch.frame, self)) - ch._frame = core.obname(self.origin, self.copynumber, self.name) - except AttributeError: - #happens if ch has been parsed as other type - pass - def describe_attr(self, buf, width=80, indent='', exclude=''): if len(self.channels) > 0: if self.index_type is not None: diff --git a/python/dlisio/plumbing/linkage.py b/python/dlisio/plumbing/linkage.py index 864ff346d..dde03934e 100644 --- a/python/dlisio/plumbing/linkage.py +++ b/python/dlisio/plumbing/linkage.py @@ -35,3 +35,13 @@ def isreference(val): return (isinstance (val, core.obname) or isinstance (val, core.objref) or isinstance (val, core.attref)) + +def findframe(fp, logical_file): + """Find all frames containing the channel""" + frames = [] + for frame in logical_file.frames: + for channel in frame.channels: + if channel.fingerprint != fp: continue + frames.append(frame.attic.name) + + return frames diff --git a/python/tests/test_curves.py b/python/tests/test_curves.py index c88600627..b3b2fa1b0 100644 --- a/python/tests/test_curves.py +++ b/python/tests/test_curves.py @@ -325,6 +325,9 @@ def test_channel_without_frame(assert_info): assert channel.frame == None assert_info('does not belong') + assert channel.curves() == None + assert_info('no recorded curve-data') + def test_channel_fmt(): with dlisio.load("data/chap4-7/frame-channels.dlis") as (f, *_): frame = f.object("FRAME", "VARIOUS") From 6c6d6f05cd3ff7954fcb6a0138087e49e4362adb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Thu, 9 Jul 2020 18:52:14 +0200 Subject: [PATCH 08/27] Make dl::object_set a self-parsing type Experience with the python implementation of parsing objects on-demand has shown that the complexity of bookkeeping the state of each object-set quickly becomes complicated. To combat some of that complexity, the object-parsing is moved to the set itself. The state of the set (i.e. parsed or unparsed) then becomes an implementation detail of the set. Thus freeing any logical file implementation of bookkeeping the state of all its object-sets. --- lib/extension/dlisio/ext/types.hpp | 46 +++++++++++++++++++++++---- lib/src/parse.cpp | 51 ++++++++++++++++++++++++------ python/dlisio/__init__.py | 2 +- python/dlisio/ext/core.cpp | 15 +++++---- 4 files changed, 91 insertions(+), 23 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 3bd496cc8..ee0f7904a 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -463,21 +463,55 @@ struct basic_object { std::vector< object_attribute > attributes; }; -/* - * The object set, after parsing, is an (unordered?) collection of objects. In - * parsing, more information is added through creating custom types, but the - * fundamental restriction is one type per set. +/* Object set + * + * The object SET, as defined by rp66v1 chapter 3.2.1 is a series of objects - + * all derived from the same object template, all of the same type. + * + * Because the pieces of information making up the objects are mostly variable + * size there is random access to get specific objects. For the same reason + * there is no way of making an index of the objects without parsing the full + * set. Hence the entire set need to be parsed and cached in one go. + * + * However, parsing a lot of sets are expensive and often unnecessary. Too + * avoid the upfront cost of parsing, object_set is a self parsing type. I.e. + * it is initialized with a buffer of raw bytes making up the set - which is + * comparably much cheaper to extract than the actual parsing. The parsing is + * considered an implementation detail of the class and will be postponed until + * the first outside query for objects. + * + * Typical object queries will revolve around the type of object - hence + * parsing the set type (and name) independently of the rest of the set makes + * sense. + * + * Caching the raw bytes on the object also makes it independent of IO. * - * The variant-of-vectors is wells suited for this + * Encrypted Records: + * + * encrypted records cannot be parsed by dlisio without being decrypted first. + * As object_set does its parsing itself, it _will_ fail on construction if + * given an encrypted record. */ using object_vector = std::vector< basic_object >; struct object_set { +public: + explicit object_set( std::vector< char > b ) noexcept (false); + int role; // TODO: enum class? dl::ident type; dl::ident name; + + void parse() noexcept (false); + bool isparsed() const noexcept (true); + + dl::object_vector& objects() noexcept (false); +private: + std::vector< char > buffer; + dl::object_vector objs; dl::object_template tmpl; - dl::object_vector objects; + + bool parsed = false; }; const char* parse_template( const char* begin, diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 4cd5894f6..9636997f9 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -674,6 +675,7 @@ bool basic_object::operator != (const basic_object& o) const noexcept (true) { return !(*this == o); } + const char* parse_template( const char* cur, const char* end, object_template& out ) noexcept (false) { @@ -989,19 +991,48 @@ const char* parse_set_component( const char* cur, return cur; } -object_set parse_objects( const char* cur, const char* end ) { - object_set set; - cur = parse_set_component( cur, end, &set.type, &set.name, &set.role); - cur = parse_template( cur, end, set.tmpl ); +object_set::object_set(std::vector< char > b) noexcept (false) { + parse_set_component(b.data(), + b.data() + b.size(), + &this->type, + &this->name, + &this->role); + this->buffer = std::move(b); +} + +void object_set::parse() noexcept (false) { + if (this->isparsed()) return; - /* - Return if set has no objects - */ + const char* beg = this->buffer.data(); + const char* end = beg + this->buffer.size(); + + /* Skip past the set component as it's already been read and parsed */ + auto cur = parse_set_component(beg, end, nullptr, nullptr, nullptr); + + object_template tmpl; + cur = parse_template(cur, end, tmpl); + + // There are no objects in the set if (std::distance( cur, end ) == 0) - return set; + return; + + auto objs = parse_objects(tmpl, this->type, cur, end); + + this->tmpl = tmpl; + this->objs = objs; + + this->parsed = true; +} + +bool object_set::isparsed() const noexcept (true) { + return this->parsed; +} + +dl::object_vector& object_set::objects() noexcept (false) { + if (not this->isparsed()) + this->parse(); - set.objects = parse_objects( set.tmpl, set.type, cur, end ); - return set; + return this->objs; } } diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index c9cc03962..d8aede675 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -622,7 +622,7 @@ def load(self, records=None, reload=True): for os in sets: # TODO: handle replacement sets - for o in os.objects: + for o in os.objects(): try: obj = self.types[os.type](o, name = o.name, lf = self) except KeyError: diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index d618cc10f..56b0ebbc9 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -690,6 +690,8 @@ noexcept (false) { } +PYBIND11_MAKE_OPAQUE( std::vector< dl::object_set > ) + PYBIND11_MODULE(core, m) { PyDateTime_IMPORT; @@ -705,6 +707,8 @@ PYBIND11_MODULE(core, m) { } }); + py::bind_vector>(m, "list(object_set)"); + m.def("open", &dl::open, py::arg("path"), py::arg("zero") = 0); m.def("open_rp66", &dl::open_rp66); m.def("open_tif", &dl::open_tapeimage); @@ -842,9 +846,9 @@ PYBIND11_MODULE(core, m) { ; py::class_< dl::object_set >( m, "object_set" ) - .def_readonly( "type", &dl::object_set::type ) - .def_readonly( "name", &dl::object_set::name ) - .def_readonly( "objects", &dl::object_set::objects ) + .def_readonly( "type", &dl::object_set::type ) + .def_readonly( "name", &dl::object_set::name ) + .def( "objects", &dl::object_set::objects ) ; py::enum_< dl::representation_code >( m, "reprc" ) @@ -963,9 +967,8 @@ PYBIND11_MODULE(core, m) { std::vector< dl::object_set > objects; for (const auto& rec : recs) { if (rec.isencrypted()) continue; - auto begin = rec.data.data(); - auto end = begin + rec.data.size(); - objects.push_back( dl::parse_objects( begin, end ) ); + if (rec.data.size() == 0) continue; + objects.push_back( dl::object_set( rec.data ) ); } return objects; }); From 29f6559fd6c45896480a4aba2fcbd94ef6ce7965 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 10 Jul 2020 10:07:38 +0200 Subject: [PATCH 09/27] Add dl::object_set::at As the object_set is given a more prominent role, add method at that makes it a bit easier to access objects in the set. --- lib/extension/dlisio/ext/types.hpp | 1 + lib/src/parse.cpp | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index ee0f7904a..a174db4d1 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -505,6 +505,7 @@ struct object_set { void parse() noexcept (false); bool isparsed() const noexcept (true); + dl::basic_object& at(const dl::obname&) noexcept (false); dl::object_vector& objects() noexcept (false); private: std::vector< char > buffer; diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 9636997f9..9a34b1c3b 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -1035,4 +1035,23 @@ dl::object_vector& object_set::objects() noexcept (false) { return this->objs; } +dl::basic_object& object_set::at(const dl::obname& key) noexcept (false) { + auto eq = [&key]( const dl::basic_object& obj ) { + return dl::decay( obj.object_name ) == key; + }; + + auto objects = this->objects(); + // TODO; handle duplications + auto itr = std::find_if( objects.begin(), + objects.end(), + eq ); + + if (itr == objects.end()) { + const auto msg = "object_set.at: No object with fingerprint {} in set"; + const auto fp = key.fingerprint( dl::decay(this->type) ) ; + throw std::out_of_range( fmt::format(msg, fp)); + } + return *itr; +} + } From e7a16dde90634c2495398aca526f6e9bc30bcfd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 10 Jul 2020 10:33:46 +0200 Subject: [PATCH 10/27] Single out encrypted records in dl::findoffsets Neither explicit nor implicit encrypted records can be processed by dlisio's parsing routines, hence they are single out straight away and stored separately. This seams cleaner than implicitly dropping them in sub-sequent parsing routines. The checks for encryptions are preserved in the parsing routines in case someone tries to do some cleverness on their own. --- lib/extension/dlisio/ext/io.hpp | 1 + lib/src/io.cpp | 5 +++-- python/dlisio/__init__.py | 7 ++++--- python/dlisio/ext/core.cpp | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index 96bf185c3..31b49528b 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -67,6 +67,7 @@ class stream { struct stream_offsets { std::vector< index_entry > explicits; std::vector< index_entry > implicits; + std::vector< index_entry > encrypted; }; stream open(const std::string&, std::int64_t) noexcept (false); diff --git a/lib/src/io.cpp b/lib/src/io.cpp index fd16e3001..92c259897 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -453,8 +453,9 @@ dl::stream_offsets findoffsets(dl::stream& file) noexcept (false) { if (not attr_consistent( attributes )) entry.consistent = false; if (not type_consistent( types )) entry.consistent = false; - if (entry.isexplicit()) idx.explicits.push_back(entry); - else idx.implicits.push_back(entry); + if (entry.isencrypted()) idx.encrypted.push_back(entry); + else if (entry.isexplicit()) idx.explicits.push_back(entry); + else idx.implicits.push_back(entry); attributes.clear(); types.clear(); diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index d8aede675..4ae305670 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -215,11 +215,12 @@ class dlis(object): to link the content of attributes to other objects. """ - def __init__(self, stream, attic, fdata_index, sul=None): + def __init__(self, stream, attic, fdata_index, encrypted, sul=None): self.file = stream self.attic = attic self.sul = sul self.fdata_index = fdata_index + self.encrypted = encrypted self.indexedobjects = defaultdict(dict) self.problematic = [] @@ -783,7 +784,7 @@ def rewind(offset, tif): if tapemarks: stream = core.open_tif(stream) stream = core.open_rp66(stream) - explicits, implicits = core.findoffsets(stream) + explicits, implicits, encrypted = core.findoffsets(stream) hint = rewind(stream.absolute_tell, tapemarks) records = core.extract(stream, [x.tell for x in explicits]) @@ -791,7 +792,7 @@ def rewind(offset, tif): for key, val in core.findfdata(stream, [x.tell for x in implicits]): fdata_index[key].append(val) - lf = dlis(stream, records, fdata_index, sul) + lf = dlis(stream, records, fdata_index, encrypted, sul) lfs.append(lf) try: diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 56b0ebbc9..8304175fe 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -980,7 +980,7 @@ PYBIND11_MODULE(core, m) { m.def( "findoffsets", []( dl::stream& file ) { const auto ofs = dl::findoffsets( file ); - return py::make_tuple( ofs.explicits, ofs.implicits ); + return py::make_tuple( ofs.explicits, ofs.implicits, ofs.encrypted ); }); m.def("set_encodings", set_encodings); From ebddc00fbc046b8f41126950dd57b70b0a4d6e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 10 Jul 2020 10:44:01 +0200 Subject: [PATCH 11/27] Add a queryable pool of metadata object dl::pool is intended to relieve python's dlis object of the responsibility of caching raw record, parsed objects and the tedious task of parsing records at the correct time. The c++ counterpart is conceptually simpler as each object_set are responsible for parsing. That means dl::pool is a simple container with some convenient functions for querying objects. --- lib/extension/dlisio/ext/types.hpp | 14 ++++++++++ lib/src/parse.cpp | 45 ++++++++++++++++++++++++++++++ python/dlisio/ext/core.cpp | 4 +++ 3 files changed, 63 insertions(+) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index a174db4d1..f7979abb2 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -515,6 +515,20 @@ struct object_set { bool parsed = false; }; +/* A queryable pool of metadata objects */ +class pool { +public: + explicit pool( std::vector< dl::object_set > e ) : eflrs(std::move(e)) {}; + + object_vector match(const std::string& type, + const std::string& name); + + dl::basic_object& at(const dl::ident&, const dl::obname&) noexcept (false); + dl::basic_object& at(const dl::objref&) noexcept(false); +private: + std::vector< dl::object_set > eflrs; +}; + const char* parse_template( const char* begin, const char* end, object_template& ) noexcept (false); diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 9a34b1c3b..79b35a71e 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -1054,4 +1055,48 @@ dl::basic_object& object_set::at(const dl::obname& key) noexcept (false) { return *itr; } +dl::basic_object& pool::at(const dl::ident& type, const dl::obname& name) +noexcept (false) { + // TODO A more clever search + dl::basic_object tmp; + for (auto& eflr : this->eflrs) { + if (eflr.type != type) continue; + + /* There might be multiple EFLR's with the correct type, so ignore + * index error while there are still more eflr's to check. + */ + try { + // TODO handle duplications + return eflr.at(name); + } catch (const std::out_of_range&) {} + } + + const auto msg = "pool.at: No object with fingerprint {}"; + const auto fp = name.fingerprint( dl::decay(type) ) ; + throw std::out_of_range( fmt::format(msg, fp)); +} + +dl::basic_object& pool::at(const dl::objref& id ) noexcept (false) { + return this->at(id.type, id.name); +} + +object_vector pool::match( const std::string& type, + const std::string& name ) +noexcept (false) { + std::regex re_type(type, std::regex_constants::icase); + std::regex re_name(type, std::regex_constants::icase); + + object_vector objs; + for (auto& eflr : this->eflrs) { + if (not std::regex_match(dl::decay(eflr.type), re_type)) continue; + + for (const auto& obj : eflr.objects()) { + const auto id = dl::decay(obj.object_name.id); + if (not std::regex_match(id, re_name)) continue; + objs.push_back(obj); + } + } + return objs; +} + } diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 8304175fe..42452ac25 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -851,6 +851,10 @@ PYBIND11_MODULE(core, m) { .def( "objects", &dl::object_set::objects ) ; + py::class_< dl::pool >( m, "logical_file" ) + .def( "match", &dl::pool::match ) + ; + py::enum_< dl::representation_code >( m, "reprc" ) .value( "fshort", dl::representation_code::fshort ) .value( "fsingl", dl::representation_code::fsingl ) From bed2b7427d1a898df9ff7130b0bcc2d9e1cabf57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Mon, 13 Jul 2020 12:56:33 +0200 Subject: [PATCH 12/27] Add dl::pool:types A common user pattern is to querry for the available object types. This is a cheap querry as no objects must be parsed in order to return the types. --- lib/extension/dlisio/ext/types.hpp | 2 ++ lib/src/parse.cpp | 8 ++++++++ python/dlisio/ext/core.cpp | 1 + 3 files changed, 11 insertions(+) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index f7979abb2..baba6227f 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -520,6 +520,8 @@ class pool { public: explicit pool( std::vector< dl::object_set > e ) : eflrs(std::move(e)) {}; + std::vector< dl::ident > types() const noexcept (false); + object_vector match(const std::string& type, const std::string& name); diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 79b35a71e..b84836e57 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -1055,6 +1055,14 @@ dl::basic_object& object_set::at(const dl::obname& key) noexcept (false) { return *itr; } +std::vector< dl::ident > pool::types() const noexcept (false) { + std::vector< dl::ident > types; + for (auto eflr : this->eflrs) { + types.push_back( eflr.type ); + } + return types; +} + dl::basic_object& pool::at(const dl::ident& type, const dl::obname& name) noexcept (false) { // TODO A more clever search diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 42452ac25..f59723846 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -852,6 +852,7 @@ PYBIND11_MODULE(core, m) { ; py::class_< dl::pool >( m, "logical_file" ) + .def_property_readonly( "types", &dl::pool::types ) .def( "match", &dl::pool::match ) ; From 71189aaaaa58182b2a399864af10833cca5676ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Wed, 22 Jul 2020 10:33:43 +0200 Subject: [PATCH 13/27] Let dl::pool::match use external matchers Rather then hardcoding a regex matcher in dl::pool's implementation, let the caller pass along their own user. I.e. we dependancy inject a matcher function into dl::pool::match. This also nable us to go back to using python's re. --- lib/extension/dlisio/ext/types.hpp | 14 ++++++++++- lib/src/parse.cpp | 20 +++++++++------ python/dlisio/ext/core.cpp | 39 ++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index baba6227f..7a022dcc9 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -515,6 +515,17 @@ struct object_set { bool parsed = false; }; +struct matcher { + virtual bool match(const dl::ident& pattern, const dl::ident& candidate) + noexcept (false) = 0; + virtual ~matcher() = default; +}; + +struct exactmatch : public matcher { + bool match(const dl::ident& pattern, const dl::ident& candidate) + noexcept (false) override; +}; + /* A queryable pool of metadata objects */ class pool { public: @@ -523,7 +534,8 @@ class pool { std::vector< dl::ident > types() const noexcept (false); object_vector match(const std::string& type, - const std::string& name); + const std::string& name, + dl::matcher& matcher); dl::basic_object& at(const dl::ident&, const dl::obname&) noexcept (false); dl::basic_object& at(const dl::objref&) noexcept(false); diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index b84836e57..cf061bf9e 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include @@ -992,6 +991,12 @@ const char* parse_set_component( const char* cur, return cur; } +bool exactmatch::match(const dl::ident& pattern, const dl::ident& candidate) +noexcept (false) { + return pattern == candidate; +} + + object_set::object_set(std::vector< char > b) noexcept (false) { parse_set_component(b.data(), b.data() + b.size(), @@ -1089,18 +1094,17 @@ dl::basic_object& pool::at(const dl::objref& id ) noexcept (false) { } object_vector pool::match( const std::string& type, - const std::string& name ) + const std::string& name, + dl::matcher& m) noexcept (false) { - std::regex re_type(type, std::regex_constants::icase); - std::regex re_name(type, std::regex_constants::icase); - object_vector objs; + for (auto& eflr : this->eflrs) { - if (not std::regex_match(dl::decay(eflr.type), re_type)) continue; + if (not m.match(dl::ident{type}, eflr.type)) continue; for (const auto& obj : eflr.objects()) { - const auto id = dl::decay(obj.object_name.id); - if (not std::regex_match(id, re_name)) continue; + if (not m.match(dl::ident{name}, obj.object_name.id)) continue; + objs.push_back(obj); } } diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index f59723846..ee0e15beb 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -688,6 +688,35 @@ noexcept (false) { return dstobj; } +/** trampoline helper class for dl::matcher bindings + * + * Creating the binding code for a abstract c++ class that we want do derive + * new classes from in python requires some extra boilerplate code in the form + * of this "trampoline" class [1]. + * + * This class helps redirect virtual calls back to python and is *not* intended + * to be used for anything other than creating valid bindings for dl::matcher. + * + * [1] https://pybind11.readthedocs.io/en/stable/advanced/classes.html#overriding-virtual-functions-in-python + */ +class Pymatcher : public dl::matcher { +public: + /* Inherit the constructor */ + using dl::matcher::matcher; + + /* Trampoline (need one for each virtual function) */ + bool match(const dl::ident& pattern, const dl::ident& candidate) + noexcept(false) override { + PYBIND11_OVERLOAD_PURE( + bool, /* Return type */ + dl::matcher, /* Parent class */ + match, /* Name of function in C++ (must match Python name) */ + pattern, /* Argument(s) */ + candidate + ); + } +}; + } PYBIND11_MAKE_OPAQUE( std::vector< dl::object_set > ) @@ -990,4 +1019,14 @@ PYBIND11_MODULE(core, m) { m.def("set_encodings", set_encodings); m.def("get_encodings", get_encodings); + + py::class_< dl::matcher, Pymatcher >( m, "matcher") + .def(py::init<>()) + .def("match", &dl::matcher::match) + ; + + py::class_< dl::exactmatch, dl::matcher >( m, "exactmatch" ) + .def(py::init<>()) + .def("match", &dl::exactmatch::match) + ; } From 8715bbb00b56be1130d43d0912095f47d7267c28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Thu, 23 Jul 2020 09:27:06 +0200 Subject: [PATCH 14/27] Add an regex_matcher that uses python's re module --- python/dlisio/plumbing/matcher.py | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 python/dlisio/plumbing/matcher.py diff --git a/python/dlisio/plumbing/matcher.py b/python/dlisio/plumbing/matcher.py new file mode 100644 index 000000000..6b6734980 --- /dev/null +++ b/python/dlisio/plumbing/matcher.py @@ -0,0 +1,36 @@ +import re + +from .. import core + +class regex_matcher(core.matcher): + """ Regex matcher + + A regex matcher using Python's re module, that can be passed to + dl::pool::match along with the search patterns. + + Examples + -------- + + create a matcher that is case insensitive and displays debug information, + and pass it to dl::pool with the search patterns for type and name: + + >>> m = matcher(re.IGNORECASE | re.DEBUG) + >>> result = metadata.match("TYPE", "NAME", m) + """ + def __init__(self, flags): + core.matcher.__init__(self) + self.flags = flags + + def match(self, pattern, candidate): + """ Overrides dl::matcher::match """ + try: + compiled = re.compile(str(pattern), flags=self.flags) + except: + msg = 'Invalid regex: {}'.format(pattern) + raise ValueError(msg) + + if (re.match(compiled, str(candidate))): + return True + else: + return False + From 0563a0fdada0baac43453e7cce18ea34ee212e92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Wed, 22 Jul 2020 16:20:12 +0200 Subject: [PATCH 15/27] Fix dl::value_vector comparison for monostate It the current type of an value_vector is mpark::monostate, mpark::visit picks the rvalue overload in variant_equal, which makes sense. However it also picks that overload when both lhs and rhs are monostate, resulting in a false negative for the comparison. A new overload with the same typename for both arguments would also resolve the problem, but I'm afraid that it would swallow more than it should. --- lib/src/parse.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index cf061bf9e..588ec9295 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -541,6 +541,12 @@ struct variant_equal { return false; } + bool operator () (mpark::monostate, + mpark::monostate) + const noexcept (true) { + return true; + } + template < typename T > bool operator () (const std::vector< T >& lhs, const std::vector< T >& rhs) From f6b04424a2ed53c1044f02784b58e8f962902d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Mon, 13 Jul 2020 13:01:03 +0200 Subject: [PATCH 16/27] Let dl::pool be the driver for metadata in dlis Use dl::pool as the main driver for metadata within the dlis-class. Objects are no longer cached in python and will be read from the c++ pool at each query. --- python/dlisio/__init__.py | 215 +++++++------------------ python/dlisio/ext/core.cpp | 12 ++ python/dlisio/plumbing/__init__.py | 1 + python/dlisio/plumbing/basicobject.py | 6 + python/tests/test_logical_file.py | 51 +----- python/tests/test_logical_record.py | 5 +- python/tests/test_monkey_patching.py | 8 +- python/tests/test_object_structures.py | 1 + python/tests/test_partitioning.py | 13 +- python/tests/test_physical_layout.py | 6 +- 10 files changed, 91 insertions(+), 227 deletions(-) diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index 4ae305670..6c1ba44a6 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -215,23 +215,19 @@ class dlis(object): to link the content of attributes to other objects. """ - def __init__(self, stream, attic, fdata_index, encrypted, sul=None): + def __init__(self, stream, metadata, fdata_index, encrypted, sul=None): self.file = stream - self.attic = attic + self.metadata = metadata self.sul = sul self.fdata_index = fdata_index self.encrypted = encrypted + # Use python's re with case-insensitivity as matcher when searching for + # metadata objects in dl::pool + self.matcher = plumbing.regex_matcher(re.IGNORECASE); - self.indexedobjects = defaultdict(dict) - self.problematic = [] + #TODO: deal with 'problematic' - self.record_types = core.parse_set_types(self.attic) - - types = ('FILE-HEADER', 'ORIGIN', 'FRAME', 'CHANNEL') - recs = [rec for rec, t in zip(self.attic, self.record_types) if t in types] - self.load(recs) - - if 'UPDATE' in self.record_types: + if self.metadata.match('UPDATE', '.*', self.matcher): msg = ('{} contains UPDATE-object(s) which changes other ' 'objects. dlisio lacks support for UPDATEs, hence the ' 'data in this logical file might be wrongly presented.') @@ -256,18 +252,8 @@ def __getitem__(self, type): objects : dict all objects of type 'type' """ - if type in self.indexedobjects: - return self.indexedobjects[type] - - recs = [rec - for rec, t - in zip(self.attic, self.record_types) - if t == type - ] - - self.load(recs, reload=False) - - return self.indexedobjects[type] + objs = self.metadata.match('^{}$'.format(type), '.*', self.matcher) + return { x.fingerprint : x for x in self.promote(objs) } def __enter__(self): return self @@ -362,19 +348,12 @@ def unknowns(self): A defaultdict index by object-type """ - recs = [rec for rec, t in zip(self.attic, self.record_types) - if t not in self.types - and not rec.encrypted - and t not in self.indexedobjects - ] - - self.load(recs, reload=False) - - unknowns = defaultdict(list) - - for key, value in self.indexedobjects.items(): - if key in self.types: continue - unknowns[key] = value + unknowns = defaultdict(dict) + for t in self.metadata.types: + if t in unknowns: continue + #TODO handle duplicated types in dl::pool.types + if t in self.types: continue + unknowns[t] = self[t] return unknowns @@ -455,22 +434,8 @@ def match(self, pattern, type="CHANNEL"): Channel(CHANNEL123) """ - def compileregex(pattern): - try: - return re.compile(pattern, re.IGNORECASE) - except: - msg = 'Invalid regex: {}'.format(pattern) - raise ValueError(msg) - - ctype = compileregex(type) - cpattern = compileregex(pattern) - - types = [x for x in set(self.record_types) if re.match(ctype, x)] - - for t in types: - for obj in self[t].values(): - if not re.match(cpattern, obj.name): continue - yield obj + matches = self.metadata.match(type, pattern, self.matcher) + return self.promote(matches) def object(self, type, name, origin=None, copynr=None): """ @@ -504,27 +469,32 @@ def object(self, type, name, origin=None, copynr=None): MKAP """ - if origin is None or copynr is None: - obj = list(self.match('^'+name+'$', type)) - if len(obj) == 1: - return obj[0] - elif len(obj) == 0: - msg = "No objects with name {} and type {} are found" - raise ValueError(msg.format(name, type)) - elif len(obj) > 1: - msg = "There are multiple {}s named {}. Found: {}" - desc = "" - for o in obj: - desc += ("(origin={}, copy={}), " - .format(o.origin, o.copynumber)) - raise ValueError(msg.format(type, name, desc)) + matches = self.metadata.match(type, name, core.exactmatch()) + matches = self.promote(matches) + + if origin is not None: + matches = [o for o in matches if o.origin == origin] + + if copynr is not None: + matches = [o for o in matches if o.copynumber == copynr] + + if len(matches) == 1: return matches[0] + + # Figure out what went wrong + elif len(matches) > 1: + msg = "There are multiple {}s named {}. Found: {}" + desc = "" + for o in matches: + desc += ("(origin={}, copy={}), " + .format(o.origin, o.copynumber)) + raise ValueError(msg.format(type, name, desc)) + + if origin is not None and copynr is not None: + msg = "Object {}.{}.{} of type {} is not found" + raise ValueError(msg.format(name, origin, copynr, type)) else: - fingerprint = core.fingerprint(type, name, origin, copynr) - try: - return self[type][fingerprint] - except KeyError: - msg = "Object {}.{}.{} of type {} is not found" - raise ValueError(msg.format(name, origin, copynr, type)) + msg = "No objects with name {} and type {} are found" + raise ValueError(msg.format(name, type)) def describe(self, width=80, indent=''): """Printable summary of the logical file @@ -555,9 +525,7 @@ def describe(self, width=80, indent=''): plumbing.describe_dict(buf, d, width, indent) known, unknown = {}, {} - for objtype in set(self.record_types): - if objtype == 'encrypted': continue - + for objtype in self.metadata.types: if objtype in self.types: known[objtype] = len(self[objtype]) else: unknown[objtype] = len(self[objtype]) @@ -571,88 +539,19 @@ def describe(self, width=80, indent=''): return plumbing.Summary(info=buf.getvalue()) - def load(self, records=None, reload=True): - """ Load and enrich raw objects into the object pool + def load(self): + """ Force load all metadata - mainly indended for debugging""" + _ = [self[x] for x in self.metadata.types] - This method converts the raw object sets into first-class dlisio python - objects, and puts them in the indexedobjects and problematic members. - - Parameters - ---------- - - records : iterable of records - - reload : bool - If False, append the new object too the pool of objects. If True, - overwrite the existing pool with new objects. - - Notes - ----- - - This method is mainly intended for internal use. It serves as a worker - for other methods that needs to populate the pool with new objects. - It's the callers responsibility to keep track of the current state of - the pool, and not load the same objects into the pool several times. - - Examples - -------- - - When opening a file with dlisio.load('path') only a few object-types - are loaded into the pool. If need be, it is possible to force dlisio to - load every object in the file into its internal cache: - - >>> with dlisio.load('file') as (f, *tail): - ... f.load() - - """ - problem = 'multiple distinct objects ' - where = 'in set {} ({}). Duplicate fingerprint = {}' - action = 'continuing with the last object' - duplicate = 'duplicate fingerprint {}' - - if reload: - indexedobjects = defaultdict(dict) - problematic = [] - - else: - indexedobjects = self.indexedobjects - problematic = self.problematic - - if records is None: records = self.attic - sets = core.parse_objects(records) - - for os in sets: - # TODO: handle replacement sets - for o in os.objects(): - try: - obj = self.types[os.type](o, name = o.name, lf = self) - except KeyError: - obj = plumbing.Unknown( - o, - name = o.name, - type = os.type, - lf = self - ) - - fingerprint = obj.fingerprint - if fingerprint in indexedobjects[os.type]: - original = indexedobjects[os.type][fingerprint] - - logging.info(duplicate.format(fingerprint)) - if original.attic != obj.attic: - msg = problem + where - msg = msg.format(os.type, os.name, fingerprint) - logging.error(msg) - logging.warning(action) - problematic.append((original, obj)) - - indexedobjects[obj.type][fingerprint] = obj - - - self.indexedobjects = indexedobjects - self.problematic = problematic - - return self + def promote(self, objects): + objs = [] + for o in objects: + try: + obj = self.types[o.type](o, name=o.name, lf=self) + except KeyError: + obj = plumbing.Unknown(o, name=o.name, type=o.type, lf=self) + objs.append(obj) + return objs def storage_label(self): """Return the storage label of the physical file @@ -787,12 +686,12 @@ def rewind(offset, tif): explicits, implicits, encrypted = core.findoffsets(stream) hint = rewind(stream.absolute_tell, tapemarks) - records = core.extract(stream, [x.tell for x in explicits]) + metadata = core.metadata_pool(stream, explicits) fdata_index = defaultdict(list) for key, val in core.findfdata(stream, [x.tell for x in implicits]): fdata_index[key].append(val) - lf = dlis(stream, records, fdata_index, encrypted, sul) + lf = dlis(stream, metadata, fdata_index, encrypted, sul) lfs.append(lf) try: diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index ee0e15beb..6fd6309c6 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -1017,6 +1017,18 @@ PYBIND11_MODULE(core, m) { return py::make_tuple( ofs.explicits, ofs.implicits, ofs.encrypted ); }); + m.def( "metadata_pool", []( dl::stream& s, + const std::vector< dl::index_entry >& index ) { + std::vector< dl::object_set > tmp; + for (const auto& eflr : index) { + if (eflr.isencrypted()) continue; + auto rec = dl::extract(s, eflr.tell); + if (rec.data.size() == 0) continue; + tmp.push_back( dl::object_set( rec.data ) ); + } + return dl::pool{ std::move(tmp) }; + }); + m.def("set_encodings", set_encodings); m.def("get_encodings", get_encodings); diff --git a/python/dlisio/plumbing/__init__.py b/python/dlisio/plumbing/__init__.py index 58a1f70e4..602aaee41 100644 --- a/python/dlisio/plumbing/__init__.py +++ b/python/dlisio/plumbing/__init__.py @@ -22,6 +22,7 @@ from .process import Process from .unknown import Unknown +from .matcher import * from .valuetypes import * from .linkage import * from .utils import * diff --git a/python/dlisio/plumbing/basicobject.py b/python/dlisio/plumbing/basicobject.py index b188484b3..1e2c88a93 100644 --- a/python/dlisio/plumbing/basicobject.py +++ b/python/dlisio/plumbing/basicobject.py @@ -221,6 +221,12 @@ def __getitem__(self, key): return parsevalue(value, parse_as) + def __eq__(self, rhs): + try: + return self.attic == rhs.attic + except AttributeError: + return False + @property def fingerprint(self): """ Object fingerprint diff --git a/python/tests/test_logical_file.py b/python/tests/test_logical_file.py index 5479ba351..7f36cab1c 100644 --- a/python/tests/test_logical_file.py +++ b/python/tests/test_logical_file.py @@ -33,7 +33,7 @@ def test_object_nonexisting(f): with pytest.raises(ValueError): _ = f.object("CHANNEL", "CHANN1", 11, 0) - with pytest.raises(TypeError): + with pytest.raises(ValueError): _ = f.object("WEIRD", "CHANN1", "-1", "-1") def test_object_solo_nameonly(f): @@ -142,52 +142,7 @@ def test_indexedobjects(f): assert len(f.comments) == 1 assert len(f.messages) == 1 -def test_indexedobjects_initial_load(fpath): - with dlisio.load(fpath) as (f, *tail): - # Only fileheader, origin, frame, and channel should be loaded - assert len(f.indexedobjects) == 4 - - -def test_indexedobjects_load_all(fpath): - with dlisio.load(fpath) as (f, *_): - f.load() - assert len(f.indexedobjects) == 23 - -def test_indexedobjects_load_unknowns(): +def test_load_unknowns(): with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as (f,): - assert len(f.indexedobjects) == 4 #FILE-HEADER, ORIGIN, FRAME, CHANNEL - assert len(f.unknowns) == 5 - assert len(f.indexedobjects) == 9 - -def test_indexedobjects_load_by_typeloading(fpath): - with dlisio.load(fpath) as (f, *tail): - fp = core.fingerprint('PARAMETER', 'PARAM1', 10, 0) - parameters = f.parameters - - assert len(parameters) == 3 - assert fp in f.indexedobjects['PARAMETER'] - -def test_indexedobjects_load_by_direct_call(fpath): - with dlisio.load(fpath) as (f, *tail): - fp = core.fingerprint('TOOL', 'TOOL1', 10, 0) - _ = f.object('TOOL', 'TOOL1', 10, 0) - - assert fp in f.indexedobjects['TOOL'] - -def test_indexedobjects_load_by_match(fpath): - with dlisio.load(fpath) as (f, *tail): - fp = core.fingerprint('MESSAGE', 'MESSAGE1', 10, 0) - - _ = list(f.match('.*' , type='MESSAGE')) - - assert fp in f.indexedobjects['MESSAGE'] - -def test_indexedobjects_load_by_link(fpath): - with dlisio.load(fpath) as (f, *tail): - fp = core.fingerprint('LONG-NAME', 'CHANN1-LONG-NAME', 10, 0) - ch = f.object('CHANNEL', 'CHANN1') + assert len(f.unknowns) == 5 - # Accessing long-name should trigger loading of all long-names - _ = ch.long_name - assert fp in f.indexedobjects['LONG-NAME'] - assert len(f.indexedobjects['LONG-NAME']) == 4 diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index 4f3fab24f..38c9cade9 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -555,7 +555,6 @@ def test_unexpected_attribute_in_set(tmpdir, merge_files_oneLR): dlisio.load(path) assert "expected SET" in str(excinfo.value) - def test_unexpected_set_in_object(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'unexpected-set-in-object.dlis') content = [ @@ -631,9 +630,7 @@ def test_cut_before_object(tmpdir, merge_files_oneLR): ] merge_files_oneLR(path, content) with dlisio.load(path) as (f,): - objects = {} - for v in f.indexedobjects.values(): - objects.update(v) + objects = f.match('.*', '.*') assert len(objects) == 0 diff --git a/python/tests/test_monkey_patching.py b/python/tests/test_monkey_patching.py index a1fdb4895..c5874868f 100644 --- a/python/tests/test_monkey_patching.py +++ b/python/tests/test_monkey_patching.py @@ -52,7 +52,7 @@ def test_type_new(f): finally: del f.types['UNKNOWN_SET'] - +@pytest.mark.skip(reason='typechange not propegated to dl::pool') def test_type_change(f): try: # Parse all parameters as if they where Channels @@ -100,12 +100,8 @@ def test_type_removal(f): # to not interfere with other tests f.types['CHANNEL'] = dlisio.plumbing.Channel - f.load() - obj = f.object('CHANNEL', 'CHANN1', 10, 0) - # Channels should now be parsed as Channel.allobjects - assert isinstance(obj, dlisio.plumbing.Channel) - assert obj not in f.unknowns + assert 'CHANNEL' not in f.unknowns def test_attribute_change_in_instance(): ch1 = Channel() diff --git a/python/tests/test_object_structures.py b/python/tests/test_object_structures.py index 291f3c6d1..967cec25c 100644 --- a/python/tests/test_object_structures.py +++ b/python/tests/test_object_structures.py @@ -5,6 +5,7 @@ from datetime import datetime import numpy as np +import pytest import dlisio def test_file_header(f): diff --git a/python/tests/test_partitioning.py b/python/tests/test_partitioning.py index d481fe4df..8eb94684c 100644 --- a/python/tests/test_partitioning.py +++ b/python/tests/test_partitioning.py @@ -10,10 +10,7 @@ def test_partitioning(): assert len(tail) == 0 def getobjects(f): - objects = {} - for v in f.indexedobjects.values(): - objects.update(v) - return objects + return f.match('.*', '.*') assert len(getobjects(f1)) == 8 assert len(getobjects(f2)) == 7 @@ -21,14 +18,14 @@ def getobjects(f): key = dlisio.core.fingerprint('FRAME', 'FRAME-INC', 10, 0) - assert f1.record_types == ['ORIGIN', 'CHANNEL', 'FRAME'] + assert f1.metadata.types == ['ORIGIN', 'CHANNEL', 'FRAME'] assert not f1.fdata_index - assert f2.record_types == ['FILE-HEADER', 'ORIGIN', 'CHANNEL', - 'FRAME', 'FRAME'] + assert f2.metadata.types == ['FILE-HEADER', 'ORIGIN', 'CHANNEL', + 'FRAME', 'FRAME'] assert f2.fdata_index[key] == [824, 1060] - assert f3.record_types == ['FILE-HEADER'] + assert f3.metadata.types == ['FILE-HEADER'] assert not f3.fdata_index def test_objects_ownership(): diff --git a/python/tests/test_physical_layout.py b/python/tests/test_physical_layout.py index 615572937..0e21c419b 100644 --- a/python/tests/test_physical_layout.py +++ b/python/tests/test_physical_layout.py @@ -45,19 +45,19 @@ def test_lrs_atributes_inconsistency(): def test_padbytes_as_large_as_record(): path = 'data/chap2/padbytes-large-as-record.dlis' with dlisio.load(path) as (f,): - assert len(f.attic) == 0 + assert len(f.match('.*', '.*')) == 0 assert len(f.fdata_index) == 0 def test_padbytes_as_large_as_segment_explicit(): path = 'data/chap2/padbytes-large-as-seg-explict.dlis' with dlisio.load(path) as (f,): - assert len(f.attic) == 0 + assert len(f.match('.*', '.*')) == 0 assert len(f.fdata_index) == 0 def test_padbytes_as_large_as_segment_implicit(): path = 'data/chap2/padbytes-large-as-seg-implicit.dlis' with dlisio.load(path) as (f,): - assert len(f.attic) == 0 + assert len(f.match('.*', '.*')) == 0 assert len(f.fdata_index) == 0 def test_padbytes_bad(): From cb9c4bc45a9c70dc3b1adb79772fe945d8a7219e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Wed, 15 Jul 2020 16:03:19 +0200 Subject: [PATCH 17/27] Set infocodes on dl::object_attribute when parsing Due to the intricate structure of a dlis-file, dlisio over-parse when a certain pieces of information is queried. This would typically makes any warnings or errors pre-mature and might result in the query failing due to an error in some (from a user-perspective) unrelated data. To combat this issue, any errors, inconsistencies or actions taken by parse_object is communicated through enums set on the parsed objects. The consumer of the objects are then responsible for investigating the this information before using the object. The intention is to follow up on with the same approach basic_object and object_set. --- lib/extension/dlisio/ext/types.hpp | 87 ++++++++++++++++ lib/src/parse.cpp | 147 +++++++++++++++++++++++++--- python/dlisio/ext/core.cpp | 37 ++++++- python/tests/conftest.py | 7 ++ python/tests/test_logical_record.py | 35 ++++--- 5 files changed, 282 insertions(+), 31 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 7a022dcc9..a65277111 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -68,6 +68,60 @@ enum class representation_code : std::uint8_t { undef = DLIS_UNDEF, }; +/* Parsing info + * + * These enum values are intended to be set _on_ the parsed objects created by + * dlisio's parsing routines. The values indicate how the object was parsed and + * potential inconsistencies, _recoverable_ errors or other relevant information found by + * the parsing routine. + * + * The enum values can be implicitly converted into std::error_code, which + * preserve the original value and offers a human readable description of the + * error. + * + * Each value is associated with a severity degree. + * + * Example: + * + * std::error_code code = dl::parsing_info:invar; + * if ( code == dl::parsing_severity::debug ) + * std::cout << err.message(); + * + * Output: + * "attr.invariant != 0" + * + */ +enum class parsing_info { + /* info */ absent_value = 1, // attr.count == 0 means absent value + /* warn */ dlisio_default = 2, // Value defaulted by dlisio + /* debug */ attr_invar = 3, // attr.invariant != 0 + /* debug */ attr_label = 4, // attr.label != 0 + /* debug */ noval_count = 5, // attr.count > 0, !attr.value + /* debug */ reprc_ne = 6, // attr.reprc != tmpl.reprc + /* debug */ count_eq = 7, // attr.count == tmpl.count + /* debug */ count_lt = 8, // attr.count < tmpl.count + /* debug */ count_gt = 9, // attr.count > tmpl.count + /* debug */ nodefault = 10, // no tmpl.value + /* warn */ reprc_invalid = 11, // unknown attr.reprc +}; + +/* Construct an std::error_code object from dl::parsing_info. std::error_code's + * constructor uses augment-dependent lookup [1] to find and execute this + * function. This makes it possible for us to tell the constructor how to + * cast from dl::parsing_info. + * + * [1] https://en.wikipedia.org/wiki/Argument-dependent_name_lookup + */ +std::error_code make_error_code(parsing_info); + +enum class parsing_severity { + info = 1, // Confirmation that things are working as expected + debug = 2, // Information that may be interesting when debugging + warning = 3, // Information that may be interesting to anyone +}; + +std::error_condition make_error_condition(parsing_severity); + /* * It's _very_ often necessary to access the raw underlying type of the strong * type aliases for comparisons, literals, or conversions. dl::decay inspects @@ -392,6 +446,24 @@ using value_vector = mpark::variant< /* * The structure of an attribute as described in 3.2.2.1 + * + * Error handling: + * + * Due to the intricate structure of a dlis-file, dlisio typically over-parse + * when a certain pieces of information is queried. This would typically makes + * any warnings or errors issued from the parsing routines pre-mature and might + * result in the query failing due to an error in some (from a + * user-perspective) unrelated data. + * + * To combat this, the result of parsing routines (and the state of the + * parsed object) is communicated through error_codes set on the object. + * + * It is the consumers responsibility of checking the state of the + * object before using it's content. + * + * object_attribute.errcodes contains a list of enumerated values from + * dl::parsing_info. These can be implicitly translated into std::error_code + * objects which offers human readable descriptions of the error(s). */ struct object_attribute { dl::ident label = {}; @@ -402,6 +474,8 @@ struct object_attribute { dl::value_vector value = {}; bool invariant = false; + std::vector< dl::parsing_info > info; + bool operator == (const object_attribute& ) const noexcept (true); }; @@ -587,4 +661,17 @@ struct index_entry { } +namespace std { + /* Register parsing_info for implicit conversion to std::error_code */ + template <> + struct is_error_code_enum + : public true_type {}; + + /* Register parsing_severity for implicit conversion to std::error_condition + */ + template <> + struct is_error_condition_enum + : public true_type {}; +} + #endif //DLISIO_EXT_TYPES_HPP diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 588ec9295..c2f8e5a1d 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -12,6 +12,109 @@ namespace { +struct parsing_info_cat : std::error_category { + const char* name() const noexcept (true) override; + std::string message(int ev) const override; +}; + +struct parsing_sev_cat : std::error_category { + const char* name() const noexcept (true) override; + std::string message(int ev) const override; + + bool equivalent( const std::error_code& code, int condition) + const noexcept (true) override; +}; + +const char* parsing_info_cat::name() const noexcept (true) { + return "parsing info"; +} + +std::string parsing_info_cat::message(int ev) const +{ + using attr = dl::parsing_info; + switch (static_cast(ev)) { + case attr::absent_value: + return "attr.count==0, value is undefined"; + case attr::dlisio_default: + return "value is defaulted by dlisio, see debug info"; + case attr::attr_invar: + return "attr.invariant != 0, ignoring"; + case attr::attr_label: + return "attr.label != 0, ignoring"; + case attr::noval_count: + return "!attr.value, attr.count > 0"; + case attr::reprc_ne: + return "attr.reprc != templ.reprc"; + case attr::count_eq: + return "attr.count == tmpl.count, using tmpl.value"; + case attr::count_lt: + return "attr.count < tmpl.count, using tmpl.value"; + case attr::count_gt: + return "attr.count > tmpl.count, using tmpl.value"; + case attr::nodefault: + return "!tmpl.value, using empty value with type attr.reprc"; + case attr::reprc_invalid: + return "attr.reprc invalid, using monostate as value"; + default: + return "unrecognised parsing infocode"; + } +} + +const char* parsing_sev_cat::name() const noexcept (true) { + return "Parsing info severity"; +} + +std::string parsing_sev_cat::message(int ev) const { + switch (static_cast< dl::parsing_severity >(ev)) { + case dl::parsing_severity::info: + return "Confirmation that things are working as expected"; + case dl::parsing_severity::debug: + return "Information that may be interesting when debugging"; + case dl::parsing_severity::warning: + default: + return "unrecognised severity code"; + } +} + +bool parsing_sev_cat::equivalent( const std::error_code& code, int condition ) +const noexcept (true) { + using attr = dl::parsing_info; + switch ( static_cast< dl::parsing_severity >(condition) ){ + case dl::parsing_severity::info: + return code == attr::absent_value; + + case dl::parsing_severity::warning: + return code == attr::dlisio_default + || code == attr::reprc_invalid; + + case dl::parsing_severity::debug: + return code == attr::attr_invar + || code == attr::attr_label + || code == attr::noval_count + || code == attr::reprc_ne + || code == attr::count_eq + || code == attr::count_lt + || code == attr::count_gt + || code == attr::nodefault; + default: + return false; + } +} + +/* The identity of an category is determined by it's address. Following the + * example set by the standard we provide a function that always returns a + * reference to the same object. + */ +const std::error_category& parsing_info_category() { + static parsing_info_cat instance; + return instance; +} + +const std::error_category& parsing_severity_category() { + static parsing_sev_cat instance; + return instance; +} + void user_warning( const std::string& ) noexcept (true) { // TODO: } @@ -566,6 +669,14 @@ noexcept (true) { namespace dl { +std::error_code make_error_code(parsing_info e) { + return std::error_code( static_cast(e), parsing_info_category()); +} + +std::error_condition make_error_condition(parsing_severity e) { + return std::error_condition( static_cast(e), parsing_severity_category() ); +} + bool object_attribute::operator == (const object_attribute& o) const noexcept (true) { return this->label == o.label @@ -776,7 +887,8 @@ struct shrink { void patch_missing_value( dl::value_vector& value, std::size_t count, - dl::representation_code reprc ) + dl::representation_code reprc, + std::vector< dl::parsing_info >& info) noexcept (false) { /* @@ -786,11 +898,15 @@ noexcept (false) if (!mpark::holds_alternative< mpark::monostate >(value)) { const auto size = mpark::visit( len(), value ); /* same size, so return */ - if (size == count) return; + if (size == count) { + info.push_back(dl::parsing_info::count_eq); + return; + } /* smaller, shrink and all is fine */ if (size > count) { mpark::visit( shrink( count ), value ); + info.push_back(dl::parsing_info::count_lt); return; } @@ -799,6 +915,7 @@ noexcept (false) * exception and consider what to do when a file actually uses this * behaviour */ + //TODO use dl::parsing_info::count_gt instead of throw const auto msg = "object attribute without no explicit value, but " "count (which is {}) > size (which is {})" ; @@ -814,6 +931,7 @@ noexcept (false) * making this switch work in the general case */ + info.push_back(dl::parsing_info::nodefault); using rpc = dl::representation_code; switch (reprc) { case rpc::fshort: reset< dl::fshort >(value).resize(count); return; @@ -844,10 +962,8 @@ noexcept (false) case rpc::status: reset< dl::status >(value).resize(count); return; case rpc::units: reset< dl::units >(value).resize(count); return; default: { - const auto msg = "unable to patch attribute with no value: " - "unknown representation code {}"; - const auto code = static_cast< int >(reprc); - throw std::runtime_error(fmt::format(msg, code)); + info.push_back(dl::parsing_info::reprc_invalid); + value = mpark::monostate{}; } } } @@ -901,12 +1017,11 @@ object_vector parse_objects( const object_template& tmpl, * Assume this is a mistake, assume it was a regular * non-invariant attribute */ - user_warning("ATTRIB:invariant in attribute, " - "but should only be in template"); + attr.info.push_back(dl::parsing_info::attr_invar); } if (flags.label) { - user_warning( "ATTRIB:label set, but must be null"); + attr.info.push_back(dl::parsing_info::attr_label); } if (flags.count) cur = cast( cur, attr.count ); @@ -927,6 +1042,7 @@ object_vector parse_objects( const object_template& tmpl, */ if (count == 0) { attr.value = mpark::monostate{}; + attr.info.push_back(dl::parsing_info::absent_value); } else if (!flags.value) { /* * Count is non-zero, but there's no value for this attribute. @@ -939,16 +1055,17 @@ object_vector parse_objects( const object_template& tmpl, * TODO: in the future it's possible to allow promotion between * certain codes (ident -> ascii), but is no need for now */ - + attr.info.push_back(dl::parsing_info::noval_count); if (flags.reprc && attr.reprc != template_attr.reprc) { - const auto msg = "count ({}) isn't 0 and representation " - "code ({}) changed, but value is not explicitly set"; - const auto code = static_cast< int >(attr.reprc); - user_warning(fmt::format(msg, count, code)); attr.value = mpark::monostate{}; + attr.info.push_back(dl::parsing_info::reprc_ne); } - patch_missing_value( attr.value, count, attr.reprc ); + patch_missing_value( attr.value, + count, + attr.reprc, + attr.info ); + attr.info.push_back(dl::parsing_info::dlisio_default); } current.set(attr); diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 6fd6309c6..5b56d2225 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -717,6 +717,22 @@ class Pymatcher : public dl::matcher { } }; +void report( const std::vector< std::error_code >& codes, + const std::string& context ) noexcept (false) { + + py::module logging = py::module::import("logging"); + using level = dl::parsing_severity; + for (const auto& code : codes) { + const std::string msg = context + ": " + code.message(); + + if (code == level::debug) logging.attr("debug")(msg); + else if (code == level::info) logging.attr("info")(msg); + else if (code == level::warning) logging.attr("warning")(msg); + /* default to logging.warning for now - should never get her anyway*/ + else logging.attr("warning")(msg); + }; +} + } PYBIND11_MAKE_OPAQUE( std::vector< dl::object_set > ) @@ -858,8 +874,25 @@ PYBIND11_MODULE(core, m) { .def( "__eq__", &dl::basic_object::operator == ) .def( "__ne__", &dl::basic_object::operator != ) .def( "__getitem__", []( dl::basic_object& o, const std::string& key ) { - try { return o.at(key).value; } - catch (const std::out_of_range& e) { throw py::key_error( e.what() ); } + dl::object_attribute attr; + try { + attr = o.at(key); + } catch (const std::out_of_range& e) { + throw py::key_error( e.what() ); + } + + if (attr.info.size()) { + const auto msg = o.object_name.fingerprint(dl::decay( o.type )) + + "-A." + + dl::decay( attr.label ); + + std::vector< std::error_code > codes( attr.info.begin(), + attr.info.end() ); + + report(codes, msg); + } + + return attr.value; }) .def( "__repr__", []( const dl::basic_object& o ) { return "dlisio.core.basic_object(name={})"_s diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 6f0a38e45..f951d2f40 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -110,6 +110,13 @@ def assert_message(message_id): assert any([message_id in r.message for r in caplog.records]) return assert_message +@pytest.fixture +def assert_debug(caplog): + caplog.set_level(logging.DEBUG) + def assert_message(message_id): + assert any([message_id in r.message for r in caplog.records]) + return assert_message + @pytest.fixture(scope="module") def fpath(tmpdir_factory, merge_files_manyLR): """ diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index 38c9cade9..9aff8af7d 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -62,9 +62,7 @@ def test_invariant_attribute(tmpdir, merge_files_oneLR): #assert attr.units == 'invariant units' assert attr == [False, False, True] - -@pytest.mark.future_warning_invariant_attribute -def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR): +def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR, assert_debug): path = os.path.join(str(tmpdir), 'invariant-attribute-in-object.dlis') content = [ 'data/chap3/start.dlis.part', @@ -79,6 +77,7 @@ def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR): attr = obj.attic['DEFAULT_ATTRIBUTE'] assert attr == [8.0] + assert_debug('attr.invariant') def test_absent_attribute(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'attribute_absent.dlis') @@ -301,8 +300,7 @@ def test_repcode_invalid_in_objects(tmpdir, merge_files_oneLR): f.load() assert "unknown representation code" in str(excinfo.value) -@pytest.mark.future_warning_repcode_different_no_value -def test_repcode_different_no_value(tmpdir, merge_files_oneLR): +def test_repcode_different_no_value(tmpdir, merge_files_oneLR, assert_log, assert_debug): path = os.path.join(str(tmpdir), 'different-repcode-no-value.dlis') content = [ 'data/chap3/start.dlis.part', @@ -316,8 +314,11 @@ def test_repcode_different_no_value(tmpdir, merge_files_oneLR): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) assert obj.attic['DEFAULT_ATTRIBUTE'] == [0j, 0j] -@pytest.mark.future_test_attributes -def test_count0_novalue(tmpdir, merge_files_oneLR): + assert_debug('!attr.value') + assert_debug('attr.reprc != templ.reprc') + assert_log('defaulted by dlisio') + +def test_count0_novalue(tmpdir, merge_files_oneLR, assert_info): path = os.path.join(str(tmpdir), 'count0-novalue.dlis') content = [ 'data/chap3/start.dlis.part', @@ -333,9 +334,10 @@ def test_count0_novalue(tmpdir, merge_files_oneLR): #assert attr.count == 0 assert attr == None + assert_info('value is undefined') -@pytest.mark.future_test_attributes -def test_count0_value_bit(tmpdir, merge_files_oneLR): + +def test_count0_value_bit(tmpdir, merge_files_oneLR, assert_info): path = os.path.join(str(tmpdir), 'count0-value-bit.dlis') content = [ 'data/chap3/start.dlis.part', @@ -351,6 +353,7 @@ def test_count0_value_bit(tmpdir, merge_files_oneLR): #assert attr.count == 0 assert attr == None + assert_info('value is undefined') @pytest.mark.future_test_attributes def test_count0_different_repcode(tmpdir, merge_files_oneLR): @@ -371,8 +374,7 @@ def test_count0_different_repcode(tmpdir, merge_files_oneLR): assert attr == None -@pytest.mark.future_warning_label_bit_set_in_object_attr -def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR): +def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR, assert_log, assert_debug): path = os.path.join(str(tmpdir), 'label_bit_set_in_attribute.dlis') content = [ 'data/chap3/start.dlis.part', @@ -384,8 +386,11 @@ def test_label_bit_set_in_attribute(tmpdir, merge_files_oneLR): with dlisio.load(path) as (f, *tail): obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) - assert obj.attic['DEFAULT_ATTRIBUTE'] + assert obj['DEFAULT_ATTRIBUTE'] + assert_debug('attr.label') + assert_debug('!attr.value') + assert_log('defaulted by dlisio') @pytest.mark.future_warning_label_bit_not_set_in_template @pytest.mark.not_implemented_datetime_timezone @@ -436,8 +441,7 @@ def test_object_name_bit_not_set_in_object(tmpdir, merge_files_oneLR): assert obj.attic['DEFAULT_ATTRIBUTE'] -@pytest.mark.future_test_attributes -def test_novalue_less_count(tmpdir, merge_files_oneLR): +def test_novalue_less_count(tmpdir, merge_files_oneLR, assert_log, assert_debug): path = os.path.join(str(tmpdir), 'novalue-less-count.dlis') content = [ 'data/chap3/start.dlis.part', @@ -455,6 +459,9 @@ def test_novalue_less_count(tmpdir, merge_files_oneLR): #assert attr.units == 'default attr units' assert attr == [-0.75] + assert_debug('!attr.value, attr.count') + assert_debug('attr.count < tmpl.count') + assert_log('defaulted by dlisio') @pytest.mark.not_implemented def test_novalue_more_count(tmpdir, merge_files_oneLR): From 63b8500242cce706a76256f28c36ac4cd547f211 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 17 Jul 2020 14:53:20 +0200 Subject: [PATCH 18/27] Move object_attribute.invariant to attribute.info Reap the benefits of dl::parsing_info's logging capabilities by moving invariant into dl::attribute.info. --- lib/extension/dlisio/ext/types.hpp | 2 +- lib/src/parse.cpp | 14 +++++++++++--- python/tests/test_logical_record.py | 4 +++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index a65277111..fe474f3a6 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -103,6 +103,7 @@ enum class parsing_info { /* debug */ count_gt = 9, // attr.count > tmpl.count /* debug */ nodefault = 10, // no tmpl.value /* warn */ reprc_invalid = 11, // unknown attr.reprc + /* info */ invariant = 12, // Invariant attribute }; /* Construct an std::error_code object from dl::parsing_info. std::error_code's @@ -472,7 +473,6 @@ struct object_attribute { representation_code reprc = representation_code::ident; dl::units units = {}; dl::value_vector value = {}; - bool invariant = false; std::vector< dl::parsing_info > info; diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index c2f8e5a1d..6a631c81d 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -35,6 +35,8 @@ std::string parsing_info_cat::message(int ev) const switch (static_cast(ev)) { case attr::absent_value: return "attr.count==0, value is undefined"; + case attr::invariant: + return "Invariant attribute, using default value from template"; case attr::dlisio_default: return "value is defaulted by dlisio, see debug info"; case attr::attr_invar: @@ -81,7 +83,8 @@ const noexcept (true) { using attr = dl::parsing_info; switch ( static_cast< dl::parsing_severity >(condition) ){ case dl::parsing_severity::info: - return code == attr::absent_value; + return code == attr::absent_value + || code == attr::invariant; case dl::parsing_severity::warning: return code == attr::dlisio_default @@ -837,7 +840,8 @@ const char* parse_template( const char* cur, if (flags.value) cur = elements( cur, attr.count, attr.reprc, attr.value ); - attr.invariant = flags.invariant; + if (flags.invariant) + attr.info.push_back(dl::parsing_info::invariant); tmp.push_back( std::move( attr ) ); @@ -988,7 +992,11 @@ object_vector parse_objects( const object_template& tmpl, if (object_flags.name) cur = cast( cur, current.object_name ); for (const auto& template_attr : tmpl) { - if (template_attr.invariant) continue; + auto invariant = std::find( template_attr.info.begin(), + template_attr.info.end(), + dl::parsing_info::invariant ) + != template_attr.info.end(); + if (invariant) continue; if (cur == end) break; const auto flags = parse_attribute_descriptor( cur ); diff --git a/python/tests/test_logical_record.py b/python/tests/test_logical_record.py index 9aff8af7d..5ec7b2e08 100644 --- a/python/tests/test_logical_record.py +++ b/python/tests/test_logical_record.py @@ -45,7 +45,7 @@ def test_default_attribute_cut(tmpdir, merge_files_oneLR): assert obj.attic['DEFAULT_ATTRIBUTE'] @pytest.mark.future_test_attributes -def test_invariant_attribute(tmpdir, merge_files_oneLR): +def test_invariant_attribute(tmpdir, merge_files_oneLR, assert_info): path = os.path.join(str(tmpdir), 'invariant_attribute.dlis') content = [ 'data/chap3/start.dlis.part', @@ -62,6 +62,8 @@ def test_invariant_attribute(tmpdir, merge_files_oneLR): #assert attr.units == 'invariant units' assert attr == [False, False, True] + assert_info('Invariant attribute') + def test_invariant_attribute_in_object(tmpdir, merge_files_oneLR, assert_debug): path = os.path.join(str(tmpdir), 'invariant-attribute-in-object.dlis') content = [ From ee361632597e04daa990c3a1b407faba8ad0a7d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Wed, 22 Jul 2020 13:23:34 +0200 Subject: [PATCH 19/27] Update encodings tests Moving the object-pool to c++ generally postpone any UnicodeWarnings from f.load() to whenever the broken string is accessed. And as a side effect of using dl::basic_object as attic we get better encoding support for strings. That is due to the fact that our string encoding is flawed: decode_str only applies to our custom type_caster for dl::ident etc. While pybind11's default caster for string-like objects eludes decode_str. --- .../chap3/object/broken-utf8-object.dlis.part | Bin 8 -> 8 bytes python/tests/test_encodings.py | 73 +++++++++++------- 2 files changed, 45 insertions(+), 28 deletions(-) diff --git a/python/data/chap3/object/broken-utf8-object.dlis.part b/python/data/chap3/object/broken-utf8-object.dlis.part index 66921079881b83badc9948287f4b4af915cb976d..bee27e792a735365735b3f40b8bce5c31205a1b9 100644 GIT binary patch literal 8 PcmXTmVPJXv@X03t4EY2j literal 8 PcmXTmVPH9Z@W>?q3%LWM diff --git a/python/tests/test_encodings.py b/python/tests/test_encodings.py index 14bf7137b..bdc6504e8 100644 --- a/python/tests/test_encodings.py +++ b/python/tests/test_encodings.py @@ -58,7 +58,6 @@ def test_broken_utf8_value(tmpdir, merge_files_oneLR): finally: dlisio.set_encodings(prev_encodings) -@pytest.mark.skip(reason="not warn on warning and sigabrt on second") def test_broken_utf8_obname_value(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'broken_utf8_obname_value.dlis') content = [ @@ -68,20 +67,24 @@ def test_broken_utf8_obname_value(tmpdir, merge_files_oneLR): 'data/chap3/objattr/broken-utf8-obname-value.dlis.part', ] merge_files_oneLR(path, content) - with pytest.warns(UnicodeWarning): - with dlisio.load(path): - pass + prev_encodings = dlisio.get_encodings() + dlisio.set_encodings([]) + try: + f, = dlisio.load(path) + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + obname = obj['DEFAULT_ATTRIBUTE'][0] + + with pytest.warns(UnicodeWarning): + _ = obname.id + dlisio.set_encodings(['koi8_r']) - with dlisio.load(path) as (f, *_): - obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) - obname = (2, 2, 'КОТ') - assert obj.attic['DEFAULT_ATTRIBUTE'] == [obname] + assert obname.id == 'КОТ' finally: dlisio.set_encodings(prev_encodings) + f.close() -@pytest.mark.xfail(strict=True, reason="fingerprint error on no encoding") def test_broken_utf8_object_name(tmpdir, merge_files_oneLR): #some actual files have obname which fails with utf-8 codec path = os.path.join(str(tmpdir), 'broken_utf8_object_name.dlis') @@ -91,18 +94,24 @@ def test_broken_utf8_object_name(tmpdir, merge_files_oneLR): 'data/chap3/object/broken-utf8-object.dlis.part', ] merge_files_oneLR(path, content) - with pytest.warns(UnicodeWarning): - with dlisio.load(path): - pass + prev_encodings = dlisio.get_encodings() + dlisio.set_encodings([]) + try: + f, = dlisio.load(path) + with pytest.warns(UnicodeWarning): + _ = f.match('.*', 'VERY_MUCH_TESTY_SET') + dlisio.set_encodings(['koi8_r']) - with dlisio.load(path) as (f, *_): - _ = f.object('VERY_MUCH_TESTY_SET', 'КАДР', 12, 4) + + objs = f.match('.*', 'VERY_MUCH_TESTY_SET') + [obj] = [x for x in objs] + assert obj.name == 'КАДР' finally: dlisio.set_encodings(prev_encodings) + f.close() -@pytest.mark.xfail(strict=True, reason="could not allocate string object") def test_broken_utf8_label(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'broken_utf8_label.dlis') content = [ @@ -111,19 +120,23 @@ def test_broken_utf8_label(tmpdir, merge_files_oneLR): 'data/chap3/object/object.dlis.part', ] merge_files_oneLR(path, content) - with pytest.warns(UnicodeWarning): - with dlisio.load(path): - pass + prev_encodings = dlisio.get_encodings() + dlisio.set_encodings([]) + try: + f, = dlisio.load(path) + obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) + + with pytest.warns(UnicodeWarning): + _ = obj.attic.keys() + dlisio.set_encodings(['koi8_r']) - with dlisio.load(path) as (f, *_): - obj = f.object('VERY_MUCH_TESTY_SET', 'OBJECT', 1, 1) - assert obj.attic['ДОХЛЫЙ-ПАРАМЕТР'] == ['Have a nice day!'] + assert 'ДОХЛЫЙ-ПАРАМЕТР' in obj.attic.keys() finally: + f.close() dlisio.set_encodings(prev_encodings) -@pytest.mark.xfail(strict=True, reason="fingerprint error on no encoding") @pytest.mark.future_test_set_names def test_broken_utf8_set(tmpdir, merge_files_oneLR): path = os.path.join(str(tmpdir), 'broken_utf8_set.dlis') @@ -134,16 +147,20 @@ def test_broken_utf8_set(tmpdir, merge_files_oneLR): 'data/chap3/object/object.dlis.part', ] merge_files_oneLR(path, content) - with pytest.warns(UnicodeWarning): - with dlisio.load(path) as (f, *_): - f.load() + prev_encodings = dlisio.get_encodings() + dlisio.set_encodings([]) + try: + f, = dlisio.load(path) + with pytest.warns(UnicodeWarning): + _ = f.metadata.types + dlisio.set_encodings(['koi8_r']) - with dlisio.load(path) as (f, *_): - _ = f.object('СЕТ_КИРИЛЛИЦЕЙ', 'OBJECT', 1, 1) - #assert set_name == 'МЕНЯ.ЗОВУТ.СЕТ' + assert 'СЕТ_КИРИЛЛИЦЕЙ' in f.metadata.types + #assert set_name == 'МЕНЯ.ЗОВУТ.СЕТ' finally: dlisio.set_encodings(prev_encodings) + f.close() From 3e6582e03a78d2e34281375b9133b62595e1199c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Thu, 23 Jul 2020 12:37:11 +0200 Subject: [PATCH 20/27] fixup! Set infocodes on dl::object_attribute when parsing --- lib/extension/dlisio/ext/types.hpp | 1 + lib/src/parse.cpp | 1 + python/dlisio/ext/core.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index fe474f3a6..e3686651a 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 6a631c81d..254c675d3 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 5b56d2225..8e26d60f7 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include From f0b740234fca489d10a27c3875329c9d6de27208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 24 Jul 2020 13:39:48 +0200 Subject: [PATCH 21/27] fixup! Let dl::pool::match use external matchers --- lib/extension/dlisio/ext/types.hpp | 6 +++--- lib/src/parse.cpp | 4 ++-- python/dlisio/ext/core.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index e3686651a..017b4f9bb 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -592,13 +592,13 @@ struct object_set { struct matcher { virtual bool match(const dl::ident& pattern, const dl::ident& candidate) - noexcept (false) = 0; + const noexcept (false) = 0; virtual ~matcher() = default; }; struct exactmatch : public matcher { bool match(const dl::ident& pattern, const dl::ident& candidate) - noexcept (false) override; + const noexcept (false) override; }; /* A queryable pool of metadata objects */ @@ -610,7 +610,7 @@ class pool { object_vector match(const std::string& type, const std::string& name, - dl::matcher& matcher); + const dl::matcher& matcher); dl::basic_object& at(const dl::ident&, const dl::obname&) noexcept (false); dl::basic_object& at(const dl::objref&) noexcept(false); diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index 254c675d3..cc8e89960 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -1124,7 +1124,7 @@ const char* parse_set_component( const char* cur, } bool exactmatch::match(const dl::ident& pattern, const dl::ident& candidate) -noexcept (false) { +const noexcept (false) { return pattern == candidate; } @@ -1227,7 +1227,7 @@ dl::basic_object& pool::at(const dl::objref& id ) noexcept (false) { object_vector pool::match( const std::string& type, const std::string& name, - dl::matcher& m) + const dl::matcher& m) noexcept (false) { object_vector objs; diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 8e26d60f7..829ba8aca 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -707,7 +707,7 @@ class Pymatcher : public dl::matcher { /* Trampoline (need one for each virtual function) */ bool match(const dl::ident& pattern, const dl::ident& candidate) - noexcept(false) override { + const noexcept(false) override { PYBIND11_OVERLOAD_PURE( bool, /* Return type */ dl::matcher, /* Parent class */ From 785a1db19f3f32827a8309772836c54710b69ab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 24 Jul 2020 10:33:52 +0200 Subject: [PATCH 22/27] Pass std::vector< dl::index_entry >& to findfdata Rather than filter the list for tells in python, pass the whole index to findfdata. --- lib/extension/dlisio/ext/io.hpp | 2 +- lib/src/io.cpp | 8 ++++---- python/dlisio/__init__.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index 31b49528b..f52109937 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -84,7 +84,7 @@ record& extract(stream&, long long, long long, record&) noexcept (false); dl::stream_offsets findoffsets(dl::stream&) noexcept (false); std::vector< std::pair< std::string, long long > > -findfdata(dl::stream&, const std::vector< long long >&) noexcept (false); +findfdata(dl::stream&, const std::vector< index_entry >&) noexcept (false); } diff --git a/lib/src/io.cpp b/lib/src/io.cpp index 92c259897..b01a31948 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -465,7 +465,7 @@ dl::stream_offsets findoffsets(dl::stream& file) noexcept (false) { } std::vector< std::pair< std::string, long long > > -findfdata(dl::stream& file, const std::vector< long long >& tells) +findfdata(dl::stream& file, const std::vector< index_entry >& index) noexcept (false) { std::vector< std::pair< std::string, long long > > xs; @@ -474,8 +474,8 @@ noexcept (false) { record rec; rec.data.reserve( OBNAME_SIZE_MAX ); - for (auto tell : tells) { - extract(file, tell, OBNAME_SIZE_MAX, rec); + for (auto iflr : index) { + extract(file, iflr.tell, OBNAME_SIZE_MAX, rec); if (rec.isencrypted()) continue; if (rec.type != 0) continue; if (rec.data.size() == 0) continue; @@ -494,7 +494,7 @@ noexcept (false) { dl::obname tmp{ dl::origin{ origin }, dl::ushort{ copy }, dl::ident{ std::string{ id, id + idlen } } }; - xs.emplace_back(tmp.fingerprint("FRAME"), tell); + xs.emplace_back(tmp.fingerprint("FRAME"), iflr.tell); } return xs; } diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index 6c1ba44a6..a9e254599 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -688,7 +688,7 @@ def rewind(offset, tif): metadata = core.metadata_pool(stream, explicits) fdata_index = defaultdict(list) - for key, val in core.findfdata(stream, [x.tell for x in implicits]): + for key, val in core.findfdata(stream, implicits): fdata_index[key].append(val) lf = dlis(stream, metadata, fdata_index, encrypted, sul) From 272a09d73c47ca9f641c9ffeb68b86c1114c30e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 24 Jul 2020 10:41:43 +0200 Subject: [PATCH 23/27] Make std::vector< dl::index_entry > an opaque type Because the memory layout of python lists and std::vector differs, pybind needs to do a copy operation during conversion. The index is passed back and forth between c++ and python, which results in at least two copies during load(). As the content of the index is never actually examined in python, these copies are completely unnecessary - and costly as the index may be quite large. Rendering them opaque means they can be passed by reference by pybind. --- python/dlisio/ext/core.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 829ba8aca..0cf1ae066 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -737,6 +737,7 @@ void report( const std::vector< std::error_code >& codes, } PYBIND11_MAKE_OPAQUE( std::vector< dl::object_set > ) +PYBIND11_MAKE_OPAQUE( std::vector< dl::index_entry > ) PYBIND11_MODULE(core, m) { PyDateTime_IMPORT; @@ -754,6 +755,7 @@ PYBIND11_MODULE(core, m) { }); py::bind_vector>(m, "list(object_set)"); + py::bind_vector>(m, "list(index_entry)"); m.def("open", &dl::open, py::arg("path"), py::arg("zero") = 0); m.def("open_rp66", &dl::open_rp66); From 4a4c4af84eea63823f5cb7a538cd1f57830cae47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 24 Jul 2020 12:15:09 +0200 Subject: [PATCH 24/27] fixup! Set infocodes on dl::object_attribute when parsing --- lib/src/parse.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index cc8e89960..a54417e04 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -687,6 +687,7 @@ const noexcept (true) { && this->count == o.count && this->reprc == o.reprc && this->units == o.units + && this->info == o.info // invariant doesn't matter for attribute equality, // so ignore it && value_variant_eq(this->value, o.value); From b99f75f4d6319e9a7c630e1823d2372118b63a07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 24 Jul 2020 12:41:28 +0200 Subject: [PATCH 25/27] Use dlis.object when creating obj-to-obj links Objects are no longer cached in Python, so calling dlis.__getitem__ creates an unnecessary amount of python objects compared to dlis.object. --- python/dlisio/plumbing/linkage.py | 14 +++++++------- python/tests/test_basic_object.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/dlisio/plumbing/linkage.py b/python/dlisio/plumbing/linkage.py index dde03934e..5aefc3750 100644 --- a/python/dlisio/plumbing/linkage.py +++ b/python/dlisio/plumbing/linkage.py @@ -5,28 +5,28 @@ def obname(objtype): def fingerprint(obj): if not isinstance(obj, core.obname): raise TypeError - return obj.fingerprint(objtype), objtype + return obj, objtype return fingerprint def objref(obj): if not isinstance(obj, core.objref): raise TypeError - return obj.fingerprint, obj.type + return obj.name, obj.type def lookup(lf, reftype, value): """Create a fingerprint from reftype(value) and look up corresponding object in the logical file.""" try: - fp, objtype = reftype(value) + name, objtype = reftype(value) except TypeError: msg = "Unable to create object-reference to '{}'" logging.warning(msg.format(value)) return None try: - return lf[objtype][fp] - except KeyError: - msg = "Referenced object '{}' not in logical file" - logging.warning(msg.format(fp)) + return lf.object(objtype, name.id, name.origin, name.copynumber) + except ValueError as e: + msg = "Unable to find linked object: {}" + logging.warning(msg.format(str(e))) return None def isreference(val): diff --git a/python/tests/test_basic_object.py b/python/tests/test_basic_object.py index 1d5cb22e1..05c6a80ec 100644 --- a/python/tests/test_basic_object.py +++ b/python/tests/test_basic_object.py @@ -60,7 +60,7 @@ def test_lookup_no_such_object(f, assert_log): res = lookup(f, linkage.obname('CHANNEL'), value) assert res is None - assert_log('not in logical file') + assert_log('Unable to find linked object') @pytest.mark.xfail(strict=True, reason="attempt to link empty fingerprint") def test_link_empty_object(tmpdir_factory, merge_files_manyLR): From 12c69a14ff447dbc25a137fc9e4369b2c165eeb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 24 Jul 2020 13:08:07 +0200 Subject: [PATCH 26/27] Create fdata_index in findfdata Creating the map directly in findfdata reduces the amount of strings that has to be returned to python. On my machine, the amount of time it takes to create the fdata index is reduced with about 6%. --- lib/extension/dlisio/ext/io.hpp | 3 ++- lib/src/io.cpp | 19 ++++++++++++++++--- python/dlisio/__init__.py | 4 +--- python/dlisio/dlisutils.py | 6 +++++- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/lib/extension/dlisio/ext/io.hpp b/lib/extension/dlisio/ext/io.hpp index f52109937..2f32b4c5c 100644 --- a/lib/extension/dlisio/ext/io.hpp +++ b/lib/extension/dlisio/ext/io.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -83,7 +84,7 @@ record& extract(stream&, long long, long long, record&) noexcept (false); dl::stream_offsets findoffsets(dl::stream&) noexcept (false); -std::vector< std::pair< std::string, long long > > +std::map< std::string, std::vector< long long > > findfdata(dl::stream&, const std::vector< index_entry >&) noexcept (false); } diff --git a/lib/src/io.cpp b/lib/src/io.cpp index b01a31948..468375d44 100644 --- a/lib/src/io.cpp +++ b/lib/src/io.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -464,10 +465,10 @@ dl::stream_offsets findoffsets(dl::stream& file) noexcept (false) { return idx; } -std::vector< std::pair< std::string, long long > > +std::map< std::string, std::vector< long long > > findfdata(dl::stream& file, const std::vector< index_entry >& index) noexcept (false) { - std::vector< std::pair< std::string, long long > > xs; + std::map< std::string, std::vector< long long > > xs; constexpr std::size_t OBNAME_SIZE_MAX = 262; @@ -494,7 +495,19 @@ noexcept (false) { dl::obname tmp{ dl::origin{ origin }, dl::ushort{ copy }, dl::ident{ std::string{ id, id + idlen } } }; - xs.emplace_back(tmp.fingerprint("FRAME"), iflr.tell); + std::string fp = tmp.fingerprint("FRAME"); + + /* Although index.size() often is fairly large, the number of + * unique frames are typically just an handful. Hence the repeated + * calls to find() are not so bad as it might seem at first glace. + */ + const auto itr = xs.find(fp); + if (itr == xs.end()) { + xs.insert( { fp, { iflr.tell } }); + } + else { + itr->second.push_back( iflr.tell ); + } } return xs; } diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index a9e254599..80e77fc68 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -687,9 +687,7 @@ def rewind(offset, tif): hint = rewind(stream.absolute_tell, tapemarks) metadata = core.metadata_pool(stream, explicits) - fdata_index = defaultdict(list) - for key, val in core.findfdata(stream, implicits): - fdata_index[key].append(val) + fdata_index = core.findfdata(stream, implicits) lf = dlis(stream, metadata, fdata_index, encrypted, sul) lfs.append(lf) diff --git a/python/dlisio/dlisutils.py b/python/dlisio/dlisutils.py index f1adc5b5a..de932657b 100644 --- a/python/dlisio/dlisutils.py +++ b/python/dlisio/dlisutils.py @@ -10,7 +10,11 @@ def curves(dlis, frame, dtype, pre_fmt, fmt, post_fmt): Reads curves for provided frame and position defined by frame format: pre_fmt (to skip), fmt (to read), post_fmt (to skip) """ - indices = dlis.fdata_index[frame.fingerprint] + try: + indices = dlis.fdata_index[frame.fingerprint] + except KeyError: + indices = list() + alloc = lambda size: np.empty(shape = size, dtype = dtype) return core.read_fdata( pre_fmt, From 028adfb427f1efb6e08455c4eff715b8ee719fed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erlend=20H=C3=A5rstad?= Date: Fri, 24 Jul 2020 13:20:22 +0200 Subject: [PATCH 27/27] Add a dl::pool::match that only matches on type Having an overload for just type means we are not restricted to use a regex matcher when just querying for type. It's rather common query, and it's faster to to with an exact matcher, compared to regex match on name which we don't care about anyway. --- lib/extension/dlisio/ext/types.hpp | 3 +++ lib/src/parse.cpp | 13 +++++++++++++ python/dlisio/__init__.py | 2 +- python/dlisio/ext/core.cpp | 10 +++++++++- 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/lib/extension/dlisio/ext/types.hpp b/lib/extension/dlisio/ext/types.hpp index 017b4f9bb..9c8b3a630 100644 --- a/lib/extension/dlisio/ext/types.hpp +++ b/lib/extension/dlisio/ext/types.hpp @@ -612,6 +612,9 @@ class pool { const std::string& name, const dl::matcher& matcher); + object_vector match(const std::string& type, + const dl::matcher& matcher) noexcept (false); + dl::basic_object& at(const dl::ident&, const dl::obname&) noexcept (false); dl::basic_object& at(const dl::objref&) noexcept(false); private: diff --git a/lib/src/parse.cpp b/lib/src/parse.cpp index a54417e04..b5fcc9d68 100644 --- a/lib/src/parse.cpp +++ b/lib/src/parse.cpp @@ -1244,4 +1244,17 @@ noexcept (false) { return objs; } +object_vector pool::match( const std::string& type, + const dl::matcher& m) +noexcept (false) { + object_vector objs; + + for (auto& eflr : this->eflrs) { + if (not m.match(dl::ident{type}, eflr.type)) continue; + auto tmp = eflr.objects(); + objs.insert(objs.end(), tmp.begin(), tmp.end()); + } + return objs; +} + } diff --git a/python/dlisio/__init__.py b/python/dlisio/__init__.py index 80e77fc68..2f45b788b 100644 --- a/python/dlisio/__init__.py +++ b/python/dlisio/__init__.py @@ -252,7 +252,7 @@ def __getitem__(self, type): objects : dict all objects of type 'type' """ - objs = self.metadata.match('^{}$'.format(type), '.*', self.matcher) + objs = self.metadata.match(type, core.exactmatch()) return { x.fingerprint : x for x in self.promote(objs) } def __enter__(self): diff --git a/python/dlisio/ext/core.cpp b/python/dlisio/ext/core.cpp index 0cf1ae066..05e7c4e59 100644 --- a/python/dlisio/ext/core.cpp +++ b/python/dlisio/ext/core.cpp @@ -918,7 +918,15 @@ PYBIND11_MODULE(core, m) { py::class_< dl::pool >( m, "logical_file" ) .def_property_readonly( "types", &dl::pool::types ) - .def( "match", &dl::pool::match ) + .def( "match", (dl::object_vector (dl::pool::*) ( + const std::string&, + const std::string&, + const dl::matcher& + )) &dl::pool::match ) + .def( "match", (dl::object_vector (dl::pool::*) ( + const std::string&, + const dl::matcher& + )) &dl::pool::match ) ; py::enum_< dl::representation_code >( m, "reprc" )