ValueError: Invalid pattern: '**' can only be an entire path component

Question

I am trying to fine-tune an LLM.

My code so far:

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
        Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

# load dataset
dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')
dataset

Error:

    ---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [12], line 2
      1 # load dataset
----> 2 dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')
      3 dataset

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1664, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
   1661 ignore_verifications = ignore_verifications or save_infos
   1663 # Create a dataset builder
-> 1664 builder_instance = load_dataset_builder(
   1665     path=path,
   1666     name=name,
   1667     data_dir=data_dir,
   1668     data_files=data_files,
   1669     cache_dir=cache_dir,
   1670     features=features,
   1671     download_config=download_config,
   1672     download_mode=download_mode,
   1673     revision=revision,
   1674     use_auth_token=use_auth_token,
   1675     **config_kwargs,
   1676 )
   1678 # Return iterable dataset in case of streaming
   1679 if streaming:

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1490, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
   1488     download_config = download_config.copy() if download_config else DownloadConfig()
   1489     download_config.use_auth_token = use_auth_token
-> 1490 dataset_module = dataset_module_factory(
   1491     path,
   1492     revision=revision,
   1493     download_config=download_config,
   1494     download_mode=download_mode,
   1495     data_dir=data_dir,
   1496     data_files=data_files,
   1497 )
   1499 # Get dataset builder class from the processing script
   1500 builder_cls = import_main_class(dataset_module.module_path)

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1242, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1237             if isinstance(e1, FileNotFoundError):
   1238                 raise FileNotFoundError(
   1239                     f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
   1240                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1241                 ) from None
-> 1242             raise e1 from None
   1243 else:
   1244     raise FileNotFoundError(
   1245         f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
   1246     )

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1223, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1215             return HubDatasetModuleFactoryWithScript(
   1216                 path,
   1217                 revision=revision,
   (...)
   1220                 dynamic_modules_path=dynamic_modules_path,
   1221             ).get_module()
   1222         else:
-> 1223             return HubDatasetModuleFactoryWithoutScript(
   1224                 path,
   1225                 revision=revision,
   1226                 data_dir=data_dir,
   1227                 data_files=data_files,
   1228                 download_config=download_config,
   1229                 download_mode=download_mode,
   1230             ).get_module()
   1231 except Exception as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
   1232     try:

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:846, in HubDatasetModuleFactoryWithoutScript.get_module(self)
    836     token = self.download_config.use_auth_token
    837 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
    838     self.name,
    839     revision=self.revision,
    840     token=token,
    841     timeout=100.0,
    842 )
    843 patterns = (
    844     sanitize_patterns(self.data_files)
    845     if self.data_files is not None
--> 846     else get_patterns_in_dataset_repository(hfh_dataset_info)
    847 )
    848 data_files = DataFilesDict.from_hf_repo(
    849     patterns,
    850     dataset_info=hfh_dataset_info,
    851     allowed_extensions=ALL_ALLOWED_EXTENSIONS,
    852 )
    853 infered_module_names = {
    854     key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
    855     for key, data_files_list in data_files.items()
    856 }

File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:471, in get_patterns_in_dataset_repository(dataset_info)
    469 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info)
    470 try:
--> 471     return _get_data_files_patterns(resolver)
    472 except FileNotFoundError:
    473     raise FileNotFoundError(
    474         f"The dataset repository at '{dataset_info.id}' doesn't contain any data file."
    475     ) from None

File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
     97 try:
     98     for pattern in patterns:
---> 99         data_files = pattern_resolver(pattern)
    100         if len(data_files) > 0:
    101             non_empty_splits.append(split)

File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:303, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, allowed_extensions)
    301 data_files_ignore = FILES_TO_IGNORE
    302 fs = HfFileSystem(repo_info=dataset_info)
--> 303 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
    304 matched_paths = [
    305     filepath
    306     for filepath in glob_iter
    307     if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
    308 ]
    309 if allowed_extensions is not None:

File /usr/local/lib/python3.9/dist-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
    602         depth = None
    604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    607 pattern = re.compile(pattern)
    609 out = {
    610     p: info
    611     for p, info in sorted(allpaths.items())
   (...)
    618     )
    619 }

File /usr/local/lib/python3.9/dist-packages/fsspec/utils.py:734, in glob_translate(pat)
    732     continue
    733 elif "**" in part:
--> 734     raise ValueError(
    735         "Invalid pattern: '**' can only be an entire path component"
    736     )
    737 if part:
    738     results.extend(_translate(part, f"{not_sep}*", not_sep))

ValueError: Invalid pattern: '**' can only be an entire path component

I searched online, and the closest thing I found is this issue: https://github.com/coala/coala/issues/401

but I could not understand their solution. Can anyone help me understand how to fix the error I am facing? Thanks.

My library versions:

peft : '0.6.0'
torch : '2.1.2+cu121'
datasets : '2.1.0'
transformers : '4.21.3'

codefun · Accepted Answer

The error is likely due to a change that breaks the interaction between the datasets package (versions somewhere between 2.1 and 2.14) and fsspec. It has been fixed (see the discussion in the issues) in the latest datasets release (2.15.0).

Update your installation with pip install -U datasets to fix the fsspec ValueError.

This solution works when starting from datasets version 2.10.1 on Python 3.10, since upgrading the package pulls in the fix that was added in version 2.15.0 and later.

ValueError: Invalid pattern: '**' can only be an entire path component

Tags:

python

large-language-model

huggingface-datasets

Hitesh Somani

1 Answer

codefun

Recent Activity

Donate For Us

ValueError: Invalid pattern: '**' can only be an entire path component

Tags:

python

large-language-model

huggingface-datasets

Hitesh Somani

1 Answer

codefun

Related questions

Recent Activity

Donate For Us