forked from activeloopai/deeplake
-
Notifications
You must be signed in to change notification settings - Fork 0
/
download_cola.py
65 lines (50 loc) · 1.54 KB
/
download_cola.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""Download handler for CoLA dataset"""
from hub import transform
from hub.schema import Primitive, Text
import zipfile
import requests
import pandas as pd
from Fast import Dataset
class Retrieve(Dataset):
    """Download the CoLA zip archive, unpack it, and store it as a hub dataset.

    The three steps are exposed as separate methods and are expected to be
    called in order: ``fetch`` -> ``unpack`` -> ``push``.

    Args:
        url: Location of the zip archive to download.
        tag: Tag passed to ``ds.store`` when the dataset is uploaded.
        schema: Schema dict handed to ``hub.transform``; ``push`` emits
            samples with the keys ``"sentence"`` and ``"labels"``.
    """

    # Seconds passed as ``timeout`` to ``requests.get`` so that an
    # unresponsive server cannot hang the script forever.
    _REQUEST_TIMEOUT = 60

    def __init__(self, url: str, tag: str, schema: dict):
        # NOTE(review): Dataset.__init__ is not invoked here, as in the
        # original code -- confirm the base class does not require it.
        self.temp = "temp"  # file name the downloaded archive is written to
        self.url = url
        self.tag = tag
        self.schema = schema

    def fetch(self):
        """Download ``self.url`` and write the response body to ``self.temp``.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        response = requests.get(self.url, timeout=self._REQUEST_TIMEOUT)
        # Fail at the download step instead of saving an error page to disk
        # and hitting a confusing BadZipFile later in ``unpack``.
        response.raise_for_status()
        with open(self.temp, "wb") as f:
            f.write(response.content)

    def unpack(self):
        """Extract every member of the archive at ``self.temp`` into the
        current working directory."""
        with zipfile.ZipFile(self.temp, "r") as z:
            z.extractall()

    def push(self):
        """Read the in-domain training split and store it under ``self.tag``.

        Returns:
            Whatever ``ds.store(self.tag)`` returns.
        """
        # Read data into memory: the file has no header row; only columns
        # 1 and 3 are kept and named "label" and "sentence" respectively.
        df = pd.read_csv(
            "./cola_public/raw/in_domain_train.tsv",
            sep="\t",
            header=None,
            usecols=[1, 3],
            names=["label", "sentence"],
        )
        # One (sentence, label) tuple per row.
        data = list(zip(df.sentence.values, df.label.values))

        @transform(schema=self.schema)
        def load_transform(sample):
            # Map a (sentence, label) tuple onto the schema's keys.
            return {"sentence": sample[0], "labels": sample[1]}

        ds = load_transform(data)
        return ds.store(self.tag)
def main(url, tag, schema):
    """Run the whole pipeline: download the archive, extract it, upload it.

    Args:
        url: Location of the zip archive to download.
        tag: Tag the resulting dataset is stored under.
        schema: Schema dict forwarded to ``Retrieve``.
    """
    retriever = Retrieve(url, tag, schema)
    # The steps depend on each other's output files, so order matters.
    for step in (retriever.fetch, retriever.unpack, retriever.push):
        step()
if __name__ == "__main__":
    # Archive to download and the tag the dataset is stored under.
    url = "https://nyu-mll.github.io/CoLA/cola_public_1.1.zip"
    tag = "activeloop/CoLA"
    # Keys must match the dict produced per sample in Retrieve.push.
    schema = dict(
        sentence=Text(shape=(None,), max_shape=(500,)),
        labels=Primitive(dtype="int64"),
    )
    main(url=url, tag=tag, schema=schema)