# This patch adds tests/fixtures/files.py which is missing from the PyPI source distribution.
# The file is taken from the GitHub repository at the same version tag.
# Without this file, the test suite cannot be run.
#
# Original patch target: /dev/null -> tests/fixtures/files.py (single hunk "@@ -0,0 +1,630 @@").
import contextlib
import csv
import json
import os
import sqlite3
import tarfile
import textwrap
import zipfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pytest

import datasets
import datasets.config


# dataset + arrow_file


@pytest.fixture(scope="session")
def dataset():
    """Build a small in-memory ``datasets.Dataset`` with nested features.

    Returns:
        A dataset of ``n`` (10) identical-shaped rows with the columns
        ``tokens``, ``labels``, ``answers`` and ``id``.
    """
    n = 10
    features = datasets.Features(
        {
            "tokens": datasets.List(datasets.Value("string")),
            "labels": datasets.List(datasets.ClassLabel(names=["negative", "positive"])),
            "answers": {
                "text": datasets.List(datasets.Value("string")),
                "answer_start": datasets.List(datasets.Value("int32")),
            },
            "id": datasets.Value("int64"),
        }
    )
    # Every column is sized from ``n`` so the row count is defined in one place.
    return datasets.Dataset.from_dict(
        {
            "tokens": [["foo"] * 5] * n,
            "labels": [[1] * 5] * n,
            "answers": [{"answer_start": [97], "text": ["1976"]}] * n,
            "id": list(range(n)),
        },
        features=features,
    )


@pytest.fixture(scope="session")
def arrow_file(tmp_path_factory, dataset):
    """Run ``dataset.map`` with ``cache_file_name`` set to a temp ``file.arrow``.

    Returns:
        The cache file path as a ``str``.
    """
    filename = str(tmp_path_factory.mktemp("data") / "file.arrow")
    dataset.map(cache_file_name=filename)
    return filename


# FILE_CONTENT + files


# NOTE(review): the leading indentation inside this literal is part of the
# payload; it was restored as 4 spaces per line -- confirm against upstream.
FILE_CONTENT = """\
    Text data.
    Second line of data."""


@pytest.fixture(scope="session")
def text_file_content():
    """Return the raw text (``FILE_CONTENT``) used by the file fixtures below."""
    return FILE_CONTENT


@pytest.fixture(scope="session")
def text_file(tmp_path_factory):
    """Write ``FILE_CONTENT`` to ``file.txt`` and return its path object."""
    filename = tmp_path_factory.mktemp("data") / "file.txt"
    with open(filename, "w") as f:
        f.write(FILE_CONTENT)
    return filename


@pytest.fixture(scope="session")
def bz2_file(tmp_path_factory):
    """Write ``FILE_CONTENT`` (UTF-8) bz2-compressed to ``file.txt.bz2``."""
    import bz2

    path = tmp_path_factory.mktemp("data") / "file.txt.bz2"
    with bz2.open(path, "wb") as f:
        f.write(bytes(FILE_CONTENT, "utf-8"))
    return path


@pytest.fixture(scope="session")
def gz_file(tmp_path_factory):
    """Write ``FILE_CONTENT`` (UTF-8) gzip-compressed to ``file.txt.gz``.

    Returns:
        The path as a ``str`` (unlike most sibling fixtures, which return a
        path object).
    """
    import gzip

    path = str(tmp_path_factory.mktemp("data") / "file.txt.gz")
    with gzip.open(path, "wb") as f:
        f.write(bytes(FILE_CONTENT, "utf-8"))
    return path


@pytest.fixture(scope="session")
def lz4_file(tmp_path_factory):
    """Write ``FILE_CONTENT`` (UTF-8) LZ4-compressed to ``file.txt.lz4``.

    Returns:
        The archive path, or ``None`` when ``datasets.config.LZ4_AVAILABLE``
        is falsy.
    """
    if not datasets.config.LZ4_AVAILABLE:
        return None

    import lz4.frame

    path = tmp_path_factory.mktemp("data") / "file.txt.lz4"
    with lz4.frame.open(path, "wb") as f:
        f.write(bytes(FILE_CONTENT, "utf-8"))
    return path


@pytest.fixture(scope="session")
def seven_zip_file(tmp_path_factory, text_file):
    """Pack ``text_file`` into ``file.txt.7z`` under its base name.

    Returns:
        The archive path, or ``None`` when ``datasets.config.PY7ZR_AVAILABLE``
        is falsy.
    """
    if not datasets.config.PY7ZR_AVAILABLE:
        return None

    import py7zr

    path = tmp_path_factory.mktemp("data") / "file.txt.7z"
    with py7zr.SevenZipFile(path, "w") as archive:
        archive.write(text_file, arcname=os.path.basename(text_file))
    return path


@pytest.fixture(scope="session")
def tar_file(tmp_path_factory, text_file):
    """Pack ``text_file`` into an uncompressed ``file.txt.tar`` under its base name."""
    path = tmp_path_factory.mktemp("data") / "file.txt.tar"
    with tarfile.TarFile(path, "w") as f:
        f.add(text_file, arcname=os.path.basename(text_file))
    return path


@pytest.fixture(scope="session")
def xz_file(tmp_path_factory):
    """Write ``FILE_CONTENT`` (UTF-8) xz-compressed to ``file.txt.xz``."""
    import lzma

    path = tmp_path_factory.mktemp("data") / "file.txt.xz"
    with lzma.open(path, "wb") as f:
        f.write(bytes(FILE_CONTENT, "utf-8"))
    return path


@pytest.fixture(scope="session")
def zip_file(tmp_path_factory, text_file):
    """Pack ``text_file`` into ``file.txt.zip`` under its base name."""
    path = tmp_path_factory.mktemp("data") / "file.txt.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(text_file, arcname=os.path.basename(text_file))
    return path


@pytest.fixture(scope="session")
def zstd_file(tmp_path_factory):
    """Write ``FILE_CONTENT`` (UTF-8) zstd-compressed to ``file.txt.zst``.

    Returns:
        The archive path, or ``None`` when
        ``datasets.config.ZSTANDARD_AVAILABLE`` is falsy.
    """
    if not datasets.config.ZSTANDARD_AVAILABLE:
        return None

    import zstandard as zstd

    path = tmp_path_factory.mktemp("data") / "file.txt.zst"
    with zstd.open(path, "wb") as f:
        f.write(bytes(FILE_CONTENT, "utf-8"))
    return path


# xml_file


@pytest.fixture(scope="session")
def xml_file(tmp_path_factory):
    """Write a five-unit Catalan/English translation-memory XML to ``file.xml``.

    Returns:
        The path object of the written file.
    """
    filename = tmp_path_factory.mktemp("data") / "file.xml"
    # NOTE(review): the markup below was reconstructed (the tags had been
    # stripped from the patch text; only the segment texts and the line layout
    # survived) -- confirm element and attribute names against upstream.
    data = textwrap.dedent(
        """\
    <?xml version="1.0" encoding="UTF-8" ?>
    <tmx version="1.4">
      <header segtype="sentence" srclang="ca" />
      <body>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 1</seg></tuv>
          <tuv xml:lang="en"><seg>Content 1</seg></tuv>
        </tu>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 2</seg></tuv>
          <tuv xml:lang="en"><seg>Content 2</seg></tuv>
        </tu>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 3</seg></tuv>
          <tuv xml:lang="en"><seg>Content 3</seg></tuv>
        </tu>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 4</seg></tuv>
          <tuv xml:lang="en"><seg>Content 4</seg></tuv>
        </tu>
        <tu>
          <tuv xml:lang="ca"><seg>Contingut 5</seg></tuv>
          <tuv xml:lang="en"><seg>Content 5</seg></tuv>
        </tu>
      </body>
    </tmx>"""
    )
    with open(filename, "w") as f:
        f.write(data)
    return filename


DATA = [
    {"col_1": "0", "col_2": 0, "col_3": 0.0},
    {"col_1": "1", "col_2": 1, "col_3": 1.0},
    {"col_1": "2", "col_2": 2, "col_3": 2.0},
    {"col_1": "3", "col_2": 3, "col_3": 3.0},
]
DATA2 = [
    {"col_1": "4", "col_2": 4, "col_3": 4.0},
    {"col_1": "5", "col_2": 5, "col_3": 5.0},
]
DATA_DICT_OF_LISTS = {
    "col_1": ["0", "1", "2", "3"],
    "col_2": [0, 1, 2, 3],
    "col_3": [0.0, 1.0, 2.0, 3.0],
}

# Same columns as DATA but with the keys in the order col_3, col_1, col_2.
DATA_312 = [
    {"col_3": 0.0, "col_1": "0", "col_2": 0},
    {"col_3": 1.0, "col_1": "1", "col_2": 1},
]

DATA_STR = [
    {"col_1": "s0", "col_2": 0, "col_3": 0.0},
    {"col_1": "s1", "col_2": 1, "col_3": 1.0},
    {"col_1": "s2", "col_2": 2, "col_3": 2.0},
    {"col_1": "s3", "col_2": 3, "col_3": 3.0},
]

# Each row lacks one of the three columns.
DATA_MISSING_FIELDS = [
    {"col_1": 1, "col_2": 2},
    {"col_1": 1, "col_3": 3},
]

# col_1 mixes int / str / None; the nested columns use differing keys.
DATA_MIXED_TYPES = [
    {"col_1": 1, "col_2": {"a": "a"}, "col_3": [{"x": "x"}]},
    {"col_1": "one", "col_2": {"b": "b"}, "col_3": [{"y": "y"}]},
    {"col_1": None, "col_2": None, "col_3": [None]},
]


def _write_csv(path, rows):
    """Write ``rows`` (dicts) to ``path`` as CSV with a col_1/col_2/col_3 header."""
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
        writer.writeheader()
        writer.writerows(rows)


def _write_jsonl(path, records):
    """Write one ``json.dumps`` document per line, each terminated by a newline."""
    with open(path, "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)


def _write_lines(path, lines):
    """Write each string of ``lines`` to ``path`` followed by a newline."""
    with open(path, "w") as f:
        f.writelines(line + "\n" for line in lines)


def _gzip_copy(src, dst):
    """Copy the bytes of ``src`` into a new gzip file at ``dst``."""
    import gzip

    with open(src, "rb") as orig_file, gzip.open(dst, "wb") as zipped_file:
        zipped_file.writelines(orig_file)


@pytest.fixture(scope="session")
def dataset_dict():
    """Return the column-oriented sample data (``DATA_DICT_OF_LISTS``)."""
    return DATA_DICT_OF_LISTS


@pytest.fixture(scope="session")
def arrow_path(tmp_path_factory):
    """Run ``Dataset.map`` over ``DATA_DICT_OF_LISTS`` with a temp ``dataset.arrow`` cache file.

    Returns:
        The cache file path as a ``str``.
    """
    dataset = datasets.Dataset.from_dict(DATA_DICT_OF_LISTS)
    path = str(tmp_path_factory.mktemp("data") / "dataset.arrow")
    dataset.map(cache_file_name=path)
    return path


@pytest.fixture(scope="session")
def sqlite_path(tmp_path_factory):
    """Create ``dataset.sqlite`` with a ``dataset`` table holding the rows of ``DATA``.

    Returns:
        The database path as a ``str``.
    """
    path = str(tmp_path_factory.mktemp("data") / "dataset.sqlite")
    with contextlib.closing(sqlite3.connect(path)) as con:
        cur = con.cursor()
        cur.execute("CREATE TABLE dataset(col_1 text, col_2 int, col_3 real)")
        # Pick the values by column name so the insert does not depend on the
        # key order of the row dicts.
        cur.executemany(
            "INSERT INTO dataset(col_1, col_2, col_3) VALUES (?, ?, ?)",
            [(item["col_1"], item["col_2"], item["col_3"]) for item in DATA],
        )
        con.commit()
    return path


@pytest.fixture(scope="session")
def csv_path(tmp_path_factory):
    """Write ``DATA`` to ``dataset.csv`` and return the path as a ``str``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.csv")
    _write_csv(path, DATA)
    return path


@pytest.fixture(scope="session")
def csv2_path(tmp_path_factory):
    """Write ``DATA`` (the same rows as ``csv_path``) to ``dataset2.csv``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset2.csv")
    _write_csv(path, DATA)
    return path


@pytest.fixture(scope="session")
def bz2_csv_path(csv_path, tmp_path_factory):
    """Write the bytes of ``csv_path`` bz2-compressed to ``dataset.csv.bz2``."""
    import bz2

    path = tmp_path_factory.mktemp("data") / "dataset.csv.bz2"
    with open(csv_path, "rb") as f:
        data = f.read()
    with bz2.open(path, "wb") as f:
        f.write(data)
    return path


@pytest.fixture(scope="session")
def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
    """Zip both CSV files, under their base names, into ``csv-dataset.zip``."""
    path = tmp_path_factory.mktemp("zip_csv_path") / "csv-dataset.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(csv_path, arcname=os.path.basename(csv_path))
        f.write(csv2_path, arcname=os.path.basename(csv2_path))
    return path


@pytest.fixture(scope="session")
def zip_uppercase_csv_path(csv_path, csv2_path, tmp_path_factory):
    """Zip both CSV files with their ``.csv`` extension upper-cased to ``.CSV``."""
    path = tmp_path_factory.mktemp("data") / "dataset.csv.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(csv_path, arcname=os.path.basename(csv_path).replace(".csv", ".CSV"))
        f.write(csv2_path, arcname=os.path.basename(csv2_path).replace(".csv", ".CSV"))
    return path


@pytest.fixture(scope="session")
def zip_csv_with_dir_path(csv_path, csv2_path, tmp_path_factory):
    """Zip both CSV files under a ``main_dir`` folder inside the archive."""
    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.csv.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(csv_path, arcname=os.path.join("main_dir", os.path.basename(csv_path)))
        f.write(csv2_path, arcname=os.path.join("main_dir", os.path.basename(csv2_path)))
    return path


@pytest.fixture(scope="session")
def parquet_path(tmp_path_factory):
    """Write ``DATA`` as a single table to ``dataset.parquet``.

    Returns:
        The file path as a ``str``.
    """
    path = str(tmp_path_factory.mktemp("data") / "dataset.parquet")
    schema = pa.schema(
        {
            "col_1": pa.string(),
            "col_2": pa.int64(),
            "col_3": pa.float64(),
        }
    )
    pa_table = pa.Table.from_pydict({k: [row[k] for row in DATA] for k in DATA[0]}, schema=schema)
    # The writer is a context manager, so it is closed (before the underlying
    # file) even if write_table raises.
    with open(path, "wb") as f, pq.ParquetWriter(f, schema=schema) as writer:
        writer.write_table(pa_table)
    return path


@pytest.fixture(scope="session")
def geoparquet_path(tmp_path_factory):
    """Fetch the GeoParquet v1.0.0 example file and re-save it as ``dataset.geoparquet``.

    Reads from a github.com URL, so this fixture needs network access.

    Returns:
        The local file path as a ``str``.
    """
    df = pd.read_parquet(path="https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet")
    path = str(tmp_path_factory.mktemp("data") / "dataset.geoparquet")
    df.to_parquet(path=path)
    return path


@pytest.fixture(scope="session")
def json_list_of_dicts_path(tmp_path_factory):
    """Dump ``{"data": DATA}`` (a list of row dicts) to ``dataset.json``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
    data = {"data": DATA}
    with open(path, "w") as f:
        json.dump(data, f)
    return path


@pytest.fixture(scope="session")
def json_dict_of_lists_path(tmp_path_factory):
    """Dump ``{"data": DATA_DICT_OF_LISTS}`` (a dict of columns) to ``dataset.json``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
    data = {"data": DATA_DICT_OF_LISTS}
    with open(path, "w") as f:
        json.dump(data, f)
    return path


@pytest.fixture(scope="session")
def jsonl_path(tmp_path_factory):
    """Write ``DATA`` as JSON Lines to ``dataset.jsonl``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
    _write_jsonl(path, DATA)
    return path


@pytest.fixture(scope="session")
def jsonl2_path(tmp_path_factory):
    """Write ``DATA`` (the same rows as ``jsonl_path``) to ``dataset2.jsonl``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset2.jsonl")
    _write_jsonl(path, DATA)
    return path


@pytest.fixture(scope="session")
def jsonl_312_path(tmp_path_factory):
    """Write ``DATA_312`` as JSON Lines to ``dataset_312.jsonl``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset_312.jsonl")
    _write_jsonl(path, DATA_312)
    return path


@pytest.fixture(scope="session")
def jsonl_str_path(tmp_path_factory):
    """Write ``DATA_STR`` as JSON Lines to ``dataset-str.jsonl``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset-str.jsonl")
    _write_jsonl(path, DATA_STR)
    return path


@pytest.fixture(scope="session")
def jsonl_missing_fields_path(tmp_path_factory):
    """Write ``DATA_MISSING_FIELDS`` as JSON Lines to ``dataset-missing-fields.jsonl``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset-missing-fields.jsonl")
    _write_jsonl(path, DATA_MISSING_FIELDS)
    return path


@pytest.fixture(scope="session")
def jsonl_mixed_types_path(tmp_path_factory):
    """Write ``DATA_MIXED_TYPES`` as JSON Lines to ``dataset-mixed-types.jsonl``."""
    path = str(tmp_path_factory.mktemp("data") / "dataset-mixed-types.jsonl")
    _write_jsonl(path, DATA_MIXED_TYPES)
    return path


@pytest.fixture(scope="session")
def text_gz_path(tmp_path_factory, text_path):
    """Gzip the file at ``text_path`` into ``dataset.txt.gz``; returns a ``str`` path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.txt.gz")
    _gzip_copy(text_path, path)
    return path


@pytest.fixture(scope="session")
def jsonl_gz_path(tmp_path_factory, jsonl_path):
    """Gzip the file at ``jsonl_path`` into ``dataset.jsonl.gz``; returns a ``str`` path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl.gz")
    _gzip_copy(jsonl_path, path)
    return path


@pytest.fixture(scope="session")
def zip_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
    """Zip both JSON Lines files, under their base names, into ``dataset.jsonl.zip``."""
    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(jsonl_path, arcname=os.path.basename(jsonl_path))
        f.write(jsonl2_path, arcname=os.path.basename(jsonl2_path))
    return path


@pytest.fixture(scope="session")
def zip_nested_jsonl_path(zip_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
    """Zip the ``zip_jsonl_path`` archive itself under ``nested/`` in a new archive.

    ``jsonl_path`` and ``jsonl2_path`` are requested but not used in the body.
    """
    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(zip_jsonl_path, arcname=os.path.join("nested", os.path.basename(zip_jsonl_path)))
    return path


@pytest.fixture(scope="session")
def zip_jsonl_with_dir_path(jsonl_path, jsonl2_path, tmp_path_factory):
    """Zip both JSON Lines files under a ``main_dir`` folder inside the archive."""
    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.jsonl.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(jsonl_path, arcname=os.path.join("main_dir", os.path.basename(jsonl_path)))
        f.write(jsonl2_path, arcname=os.path.join("main_dir", os.path.basename(jsonl2_path)))
    return path


@pytest.fixture(scope="session")
def tar_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
    """Tar both JSON Lines files, under their base names, into ``dataset.jsonl.tar``."""
    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.tar"
    with tarfile.TarFile(path, "w") as f:
        f.add(jsonl_path, arcname=os.path.basename(jsonl_path))
        f.add(jsonl2_path, arcname=os.path.basename(jsonl2_path))
    return path


@pytest.fixture(scope="session")
def tar_nested_jsonl_path(tar_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
    """Tar the ``tar_jsonl_path`` archive itself under ``nested/`` in a new archive.

    ``jsonl_path`` and ``jsonl2_path`` are requested but not used in the body.
    """
    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.tar"
    with tarfile.TarFile(path, "w") as f:
        f.add(tar_jsonl_path, arcname=os.path.join("nested", os.path.basename(tar_jsonl_path)))
    return path


@pytest.fixture(scope="session")
def text_path(tmp_path_factory):
    """Write the lines "0".."3" to ``dataset.txt``; returns a ``str`` path."""
    data = ["0", "1", "2", "3"]
    path = str(tmp_path_factory.mktemp("data") / "dataset.txt")
    _write_lines(path, data)
    return path


@pytest.fixture(scope="session")
def text2_path(tmp_path_factory):
    """Write the lines "0".."3" to ``dataset2.txt``; returns a ``str`` path."""
    data = ["0", "1", "2", "3"]
    path = str(tmp_path_factory.mktemp("data") / "dataset2.txt")
    _write_lines(path, data)
    return path


@pytest.fixture(scope="session")
def text_dir(tmp_path_factory):
    """Write ``dataset.txt`` (lines "0".."3") and return its parent directory."""
    data = ["0", "1", "2", "3"]
    path = tmp_path_factory.mktemp("data_text_dir") / "dataset.txt"
    _write_lines(path, data)
    return path.parent


@pytest.fixture(scope="session")
def text_dir_with_unsupported_extension(tmp_path_factory):
    """Write the lines "0".."3" to ``dataset.abc``.

    Returns:
        The path of the file itself (not its parent directory, despite the
        fixture name).
    """
    data = ["0", "1", "2", "3"]
    path = tmp_path_factory.mktemp("data") / "dataset.abc"
    _write_lines(path, data)
    return path


@pytest.fixture(scope="session")
def zip_text_path(text_path, text2_path, tmp_path_factory):
    """Zip both text files, under their base names, into ``dataset.text.zip``."""
    path = tmp_path_factory.mktemp("data") / "dataset.text.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(text_path, arcname=os.path.basename(text_path))
        f.write(text2_path, arcname=os.path.basename(text2_path))
    return path


@pytest.fixture(scope="session")
def zip_text_with_dir_path(text_path, text2_path, tmp_path_factory):
    """Zip both text files under a ``main_dir`` folder inside the archive."""
    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.text.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(text_path, arcname=os.path.join("main_dir", os.path.basename(text_path)))
        f.write(text2_path, arcname=os.path.join("main_dir", os.path.basename(text2_path)))
    return path


@pytest.fixture(scope="session")
def zip_unsupported_ext_path(text_path, text2_path, tmp_path_factory):
    """Zip both text files under the member names ``unsupported.ext`` / ``unsupported_2.ext``."""
    path = tmp_path_factory.mktemp("data") / "dataset.ext.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(text_path, arcname="unsupported.ext")
        f.write(text2_path, arcname="unsupported_2.ext")
    return path


@pytest.fixture(scope="session")
def text_path_with_unicode_new_lines(tmp_path_factory):
    """Write three ``\\n``-joined lines, the second containing U+2029, as UTF-8.

    Returns:
        The file path as a ``str``.
    """
    text = "\n".join(["First", "Second\u2029with Unicode new line", "Third"])
    path = str(tmp_path_factory.mktemp("data") / "dataset_with_unicode_new_lines.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    return path


@pytest.fixture(scope="session")
def image_file():
    """Return the relative path ``tests/features/data/test_image_rgb.jpg``."""
    return os.path.join("tests", "features", "data", "test_image_rgb.jpg")


@pytest.fixture(scope="session")
def audio_file():
    """Return the relative path ``tests/features/data/test_audio_44100.wav``."""
    return os.path.join("tests", "features", "data", "test_audio_44100.wav")


@pytest.fixture(scope="session")
def audio_file_44100():
    """Return the relative path ``tests/features/data/test_audio_44100.mp3``."""
    return os.path.join("tests", "features", "data", "test_audio_44100.mp3")


@pytest.fixture(scope="session")
def audio_file_16000():
    """Return the relative path ``tests/features/data/test_audio_16000.mp3``."""
    return os.path.join("tests", "features", "data", "test_audio_16000.mp3")


@pytest.fixture(scope="session")
def tensor_file(tmp_path_factory):
    """Save ``torch.ones(128)`` to ``tensor.pth`` and return its path object."""
    import torch

    path = tmp_path_factory.mktemp("data") / "tensor.pth"
    with open(path, "wb") as f:
        torch.save(torch.ones(128), f)
    return path


@pytest.fixture(scope="session")
def zip_image_path(image_file, tmp_path_factory):
    """Zip ``image_file`` twice: under its base name and under a ``...2.jpg`` variant."""
    path = tmp_path_factory.mktemp("data") / "dataset.img.zip"
    with zipfile.ZipFile(path, "w") as f:
        f.write(image_file, arcname=os.path.basename(image_file))
        f.write(image_file, arcname=os.path.basename(image_file).replace(".jpg", "2.jpg"))
    return path


@pytest.fixture(scope="session")
def data_dir_with_hidden_files(tmp_path_factory):
    """Create a directory tree containing both visible and dot-prefixed entries.

    Layout under the returned directory:
        subdir/train.txt, subdir/test.txt, subdir/.test.txt (hidden file),
        .subdir/train.txt, .subdir/test.txt (hidden directory).

    Returns:
        The root directory path object.
    """
    data_dir = tmp_path_factory.mktemp("data_dir")

    (data_dir / "subdir").mkdir()
    with open(data_dir / "subdir" / "train.txt", "w") as f:
        f.write("foo\n" * 10)
    with open(data_dir / "subdir" / "test.txt", "w") as f:
        f.write("bar\n" * 10)
    # hidden file
    with open(data_dir / "subdir" / ".test.txt", "w") as f:
        f.write("bar\n" * 10)

    # hidden directory
    (data_dir / ".subdir").mkdir()
    with open(data_dir / ".subdir" / "train.txt", "w") as f:
        f.write("foo\n" * 10)
    with open(data_dir / ".subdir" / "test.txt", "w") as f:
        f.write("bar\n" * 10)

    return data_dir