I am trying to fine-tune an LLM.
My code so far:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
AutoTokenizer,
AutoConfig,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
# load dataset: fetch 'TokenBender/code_instructions_122k_alpaca_style' by repo id from the Hugging Face Hub
dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')
dataset  # bare expression so the notebook cell displays the loaded object
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In [12], line 2
1 # load dataset
----> 2 dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')
3 dataset
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1664, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
1661 ignore_verifications = ignore_verifications or save_infos
1663 # Create a dataset builder
-> 1664 builder_instance = load_dataset_builder(
1665 path=path,
1666 name=name,
1667 data_dir=data_dir,
1668 data_files=data_files,
1669 cache_dir=cache_dir,
1670 features=features,
1671 download_config=download_config,
1672 download_mode=download_mode,
1673 revision=revision,
1674 use_auth_token=use_auth_token,
1675 **config_kwargs,
1676 )
1678 # Return iterable dataset in case of streaming
1679 if streaming:
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1490, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
1488 download_config = download_config.copy() if download_config else DownloadConfig()
1489 download_config.use_auth_token = use_auth_token
-> 1490 dataset_module = dataset_module_factory(
1491 path,
1492 revision=revision,
1493 download_config=download_config,
1494 download_mode=download_mode,
1495 data_dir=data_dir,
1496 data_files=data_files,
1497 )
1499 # Get dataset builder class from the processing script
1500 builder_cls = import_main_class(dataset_module.module_path)
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1242, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1237 if isinstance(e1, FileNotFoundError):
1238 raise FileNotFoundError(
1239 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
1240 f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
1241 ) from None
-> 1242 raise e1 from None
1243 else:
1244 raise FileNotFoundError(
1245 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
1246 )
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1223, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1215 return HubDatasetModuleFactoryWithScript(
1216 path,
1217 revision=revision,
(...)
1220 dynamic_modules_path=dynamic_modules_path,
1221 ).get_module()
1222 else:
-> 1223 return HubDatasetModuleFactoryWithoutScript(
1224 path,
1225 revision=revision,
1226 data_dir=data_dir,
1227 data_files=data_files,
1228 download_config=download_config,
1229 download_mode=download_mode,
1230 ).get_module()
1231 except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
1232 try:
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:846, in HubDatasetModuleFactoryWithoutScript.get_module(self)
836 token = self.download_config.use_auth_token
837 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
838 self.name,
839 revision=self.revision,
840 token=token,
841 timeout=100.0,
842 )
843 patterns = (
844 sanitize_patterns(self.data_files)
845 if self.data_files is not None
--> 846 else get_patterns_in_dataset_repository(hfh_dataset_info)
847 )
848 data_files = DataFilesDict.from_hf_repo(
849 patterns,
850 dataset_info=hfh_dataset_info,
851 allowed_extensions=ALL_ALLOWED_EXTENSIONS,
852 )
853 infered_module_names = {
854 key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
855 for key, data_files_list in data_files.items()
856 }
File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:471, in get_patterns_in_dataset_repository(dataset_info)
469 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info)
470 try:
--> 471 return _get_data_files_patterns(resolver)
472 except FileNotFoundError:
473 raise FileNotFoundError(
474 f"The dataset repository at '{dataset_info.id}' doesn't contain any data file."
475 ) from None
File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
97 try:
98 for pattern in patterns:
---> 99 data_files = pattern_resolver(pattern)
100 if len(data_files) > 0:
101 non_empty_splits.append(split)
File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:303, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, allowed_extensions)
301 data_files_ignore = FILES_TO_IGNORE
302 fs = HfFileSystem(repo_info=dataset_info)
--> 303 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
304 matched_paths = [
305 filepath
306 for filepath in glob_iter
307 if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
308 ]
309 if allowed_extensions is not None:
File /usr/local/lib/python3.9/dist-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
602 depth = None
604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
607 pattern = re.compile(pattern)
609 out = {
610 p: info
611 for p, info in sorted(allpaths.items())
(...)
618 )
619 }
File /usr/local/lib/python3.9/dist-packages/fsspec/utils.py:734, in glob_translate(pat)
732 continue
733 elif "**" in part:
--> 734 raise ValueError(
735 "Invalid pattern: '**' can only be an entire path component"
736 )
737 if part:
738 results.extend(_translate(part, f"{not_sep}*", not_sep))
ValueError: Invalid pattern: '**' can only be an entire path component
I tried to find something online; the closest I found is this issue: https://github.com/coala/coala/issues/401
but I could not understand their solution. Can anyone help me understand how to fix the error I am facing? Thanks.
My library versions:
The error is likely due to an incompatibility between the datasets package (somewhere between versions 2.1 and 2.14) and fsspec. It has been fixed (see the discussion in the issues) in the latest datasets release (2.15.0).
Update your installation with pip install -U datasets to fix the fsspec ValueError.
The same solution works when starting from datasets version 2.10.1 on Python 3.10, since upgrading the package brings in the hotfix that was added in versions >= 2.15.0.
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With