
Adding option to export per-sample JSON in FiftyOneDataset format #2028

Merged Aug 30, 2022 · 26 commits

Changes from 1 commit
921dff7
bumps, release notes
benjaminpkane Aug 19, 2022
628f012
add to release notes
benjaminpkane Aug 19, 2022
6363b62
linting
benjaminpkane Aug 19, 2022
ad9c90f
Merge branch 'develop' into release/v0.16.6
brimoor Aug 24, 2022
3d9d59a
updating pkg versions
brimoor Aug 24, 2022
87aefe1
removing unnecessary pull
brimoor Aug 24, 2022
07a7941
Merge branch 'develop' into release/v0.16.6
brimoor Aug 24, 2022
4e4a30b
move App build to end
brimoor Aug 25, 2022
ae022c3
updating release notes
brimoor Aug 25, 2022
38a9204
always migrate when user is admin
brimoor Aug 25, 2022
f8efbaa
moving legacy troubleshooting to another page
brimoor Aug 25, 2022
6bedc9c
adding docs on coordinating migrations
brimoor Aug 25, 2022
ef536b1
adding helpful error when a dataset fails to load
brimoor Aug 25, 2022
5eb1cf3
initial work
brimoor Aug 25, 2022
ff8760a
pinning max requirement
brimoor Aug 25, 2022
f5f671b
using more cloud-friendly layout
brimoor Aug 25, 2022
c6b5387
adding upgrade note
brimoor Aug 25, 2022
d648803
Merge branch 'release/v0.16.6' into feature/fiftyone-dataset-update
brimoor Aug 25, 2022
aa40130
adding support for rendering progress bars in DynamicBatcher
brimoor Aug 26, 2022
dae6ce6
using DynamicBatcher's progress feature
brimoor Aug 26, 2022
613007f
adding support for importing/exporting directories of docs
brimoor Aug 26, 2022
9e58330
reduce max limit
brimoor Aug 26, 2022
7e4829f
adding support for importing/exporting per-frame directories in Fifty…
brimoor Aug 26, 2022
44b4f9c
adding use_dirs unit test
brimoor Aug 26, 2022
8b98239
documenting use_dirs option
brimoor Aug 26, 2022
04109aa
Merge branch 'develop' into feature/fiftyone-dataset-update
brimoor Aug 26, 2022
adding support for importing/exporting directories of docs
brimoor committed Aug 26, 2022
commit 613007ffda05762cf3f5d3a6ee4e9debc7e7e771
99 changes: 88 additions & 11 deletions fiftyone/core/odm/database.py
@@ -534,19 +534,39 @@ def export_document(doc, json_path):
     etau.write_file(json_util.dumps(doc), json_path)
 
 
-def export_collection(docs, json_path, key="documents", num_docs=None):
+def export_collection(
+    docs,
+    json_dir_or_path,
+    key="documents",
+    patt="{idx:06d}-{id}.json",
+    num_docs=None,
+):
     """Exports the collection to disk in JSON format.
 
     Args:
-        docs: an iteraable containing the documents to export
-        json_path: the path to write the JSON file
+        docs: an iterable containing the documents to export
+        json_dir_or_path: the path to write a single JSON file containing the
+            entire collection, or a directory in which to write per-document
+            JSON files
         key ("documents"): the field name under which to store the documents
+            when ``json_path`` is a single JSON file
+        patt ("{idx:06d}-{id}.json"): a filename pattern to use when
+            ``json_path`` is a directory. The pattern may contain ``idx`` to
+            refer to the index of the document in ``docs`` or ``id`` to refer
+            to the document's ID
         num_docs (None): the total number of documents. If omitted, this must
            be computable via ``len(docs)``
     """
     if num_docs is None:
         num_docs = len(docs)
 
+    if json_dir_or_path.endswith(".json"):
+        _export_collection_single(docs, json_dir_or_path, key, num_docs)
+    else:
+        _export_collection_multi(docs, json_dir_or_path, patt, num_docs)
+
+
+def _export_collection_single(docs, json_path, key, num_docs):
     etau.ensure_basedir(json_path)
 
     with open(json_path, "w") as f:
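As a quick aside on the new ``patt`` argument (an illustration, not part of the diff): with the default pattern, each exported document's filename is its 1-based, zero-padded index in ``docs`` plus its ID. The ObjectId string below is made up.

patt = "{idx:06d}-{id}.json"

print(patt.format(idx=1, id="63084b8f2c1a9d0e4a1b2c3d"))
# 000001-63084b8f2c1a9d0e4a1b2c3d.json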
@@ -560,6 +580,16 @@ def export_collection(docs, json_path, key="documents", num_docs=None):
         f.write("]}")
 
 
+def _export_collection_multi(docs, json_dir, patt, num_docs):
+    etau.ensure_dir(json_dir)
+
+    json_patt = os.path.join(json_dir, patt)
+    with fou.ProgressBar(total=num_docs, iters_str="docs") as pb:
+        for idx, doc in pb(enumerate(docs, 1)):
+            json_path = json_patt.format(idx=idx, id=str(doc["_id"]))
+            export_document(doc, json_path)
+
+
 def import_document(json_path):
     """Imports a document from JSON on disk.
 
@@ -573,27 +603,49 @@ def import_document(json_path):
         return json_util.loads(f.read())
 
 
-def import_collection(json_path, key="documents"):
+def import_collection(json_dir_or_path, key="documents"):
     """Imports the collection from JSON on disk.
 
     Args:
-        json_path: the path to the collection on disk
+        json_dir_or_path: the path to a JSON file on disk, or a directory
+            containing per-document JSON files
         key ("documents"): the field name under which the documents are stored
+            when ``json_path`` is a single JSON file
 
     Returns:
         a tuple of
 
-        - the list of BSON documents
+        - an iterable of BSON documents
         - the number of documents
     """
+    if json_dir_or_path.endswith(".json"):
+        return _import_collection_single(json_dir_or_path, key)
+
+    return _import_collection_multi(json_dir_or_path)
+
+
+def _import_collection_single(json_path, key):
     with open(json_path, "r") as f:
         docs = json_util.loads(f.read()).get(key, [])
 
-    return docs, len(docs)
+    num_docs = len(docs)
+
+    return docs, num_docs
 
 
-def insert_documents(docs, coll, ordered=False):
-    """Inserts a list of documents into a collection.
+def _import_collection_multi(json_dir):
+    json_paths = [
+        p
+        for p in etau.list_files(json_dir, abs_paths=True)
+        if p.endswith(".json")
+    ]
+    docs = map(import_document, json_paths)
+
+    return docs, len(json_paths)
+
+
+def insert_documents(docs, coll, ordered=False, progress=False, num_docs=None):
+    """Inserts documents into a collection.
 
     The ``_id`` field of the input documents will be populated if it is not
     already set.
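Note that the two import branches return different shapes: the single-file branch returns a fully materialized list, while the directory branch returns a lazy ``map`` over the per-document files. Continuing the hypothetical sketch above:

docs, num_docs = food.import_collection("/tmp/export/samples")

# In directory mode `docs` is a lazy iterator; materialize it only if random
# access is needed, otherwise stream it straight into insert_documents()
docs = list(docs)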
@@ -602,14 +654,39 @@ def insert_documents(docs, coll, ordered=False):
     Args:
         docs: an iterable of BSON document dicts
         coll: a pymongo collection
         ordered (False): whether the documents must be inserted in order
+        progress (False): whether to render a progress bar tracking the
+            insertion
+        num_docs (None): the total number of documents. Only used when
+            ``progress=True``. If omitted, this will be computed via
+            ``len(docs)``, if possible
 
     Returns:
         a list of IDs of the inserted documents
     """
     ids = []
 
     try:
-        for batch in fou.iter_batches(docs, 100000):  # mongodb limit
-            coll.insert_many(list(batch), ordered=ordered)
-            ids.extend(b["_id"] for b in batch)
+        batcher = fou.DynamicBatcher(
+            docs,
+            target_latency=0.2,
+            init_batch_size=1,
+            max_batch_beta=2.0,
+            max_batch_size=100000,  # mongodb limit
+            progress=progress,
+            total=num_docs,
+        )
+
+        with batcher:
+            for batch in batcher:
+                batch = list(batch)
+                coll.insert_many(batch, ordered=ordered)
+                ids.extend(b["_id"] for b in batch)
     except BulkWriteError as bwe:
         msg = bwe.details["writeErrors"][0]["errmsg"]
         raise ValueError(msg) from bwe
 
     return ids
 
 
 def bulk_write(ops, coll, ordered=False):
     """Performs a batch of write operations on a collection.
Expand Down