Autogoal.datasets. init
import shutil
from pathlib import Path
from typing import Dict
import json
import requests
import os
from tqdm import tqdm
from functools import lru_cache
DATASETS_METADATA = (
"https://raw.githubusercontent.com/autogoal/datasets/master/datasets.json"
)
DATA_PATH = Path.home() / ".autogoal" / "data"
ensure data path directory creation
os.makedirs(DATA_PATH, exist_ok=True)
@lru_cache()
def get_datasets_list() -> Dict[str, str]:
try:
data = requests.get(DATASETS_METADATA).json()
with open(DATA_PATH / "datasets.json", "w") as fp:
json.dump(data, fp, indent=2)
return data
except requests.ConnectionError as e:
try:
with open(DATA_PATH / "datasets.json", "r") as fp:
return json.load(fp)
except IOError:
raise Exception(
"Cannot download dataset list and no cached version exists."
)
def datapath(path: str) -> Path:
Note
Returns a Path
object that points to the dataset path
where path
is located.
Examples¶
```python
datapath("movie_reviews") PosixPath('/home/coder/.autogoal/data/movie_reviews')
```
return Path(DATA_PATH) / path
def pack(folder: str):
filename = datapath(folder)
rootdir = datapath(folder)
shutil.make_archive(filename, "zip", root_dir=rootdir)
def unpack(zipfile: str):
filename = datapath(zipfile)
rootdir = datapath(zipfile[:-4])
shutil.unpack_archive(filename, extract_dir=rootdir, format="zip")
def download(dataset: str, unpackit: bool = True):
fname = f"{dataset}.zip"
path = datapath(fname)
if path.exists():
return
datasets = get_datasets_list()
url = datasets[dataset]
download_and_save(url, path, True)
if unpackit:
unpack(fname)
def download_and_save(url, path: Path, overwrite=False, data_length=None):
stream = requests.get(url, stream=True)
total_size = data_length or int(stream.headers.get("content-length", 0))
if path.exists() and not overwrite:
return False
try:
with path.open("wb") as f:
with tqdm(
total=total_size, unit="B", unit_scale=True, unit_divisor=1024
) as pbar:
for data in stream.iter_content(32 * 1024):
f.write(data)
pbar.update(len(data))
return True
except:
path.unlink()
raise