class documentation

AbstractDataSet is the base class for all data set implementations. All data set implementations should extend this abstract class and implement the methods marked as abstract. If a specific dataset implementation cannot be used in conjunction with the ParallelRunner, that user-defined dataset should set the attribute _SINGLE_PROCESS = True. Example:

>>> from pathlib import Path, PurePosixPath
>>> import pandas as pd
>>> from kedro.io import AbstractDataSet
>>>
>>>
>>> class MyOwnDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]):
>>>     def __init__(self, filepath, param1, param2=True):
>>>         self._filepath = PurePosixPath(filepath)
>>>         self._param1 = param1
>>>         self._param2 = param2
>>>
>>>     def _load(self) -> pd.DataFrame:
>>>         return pd.read_csv(self._filepath)
>>>
>>>     def _save(self, df: pd.DataFrame) -> None:
>>>         df.to_csv(str(self._filepath))
>>>
>>>     def _exists(self) -> bool:
>>>         return Path(self._filepath.as_posix()).exists()
>>>
>>>     def _describe(self):
>>>         return dict(param1=self._param1, param2=self._param2)

Example catalog.yml specification:

my_dataset:
    type: <path-to-my-own-dataset>.MyOwnDataSet
    filepath: data/01_raw/my_data.csv
    param1: <param1-value> # param1 is a required argument
    # param2 will be True by default
Class Method from_config Create a data set instance using the configuration provided.
Method __str__ Undocumented
Method exists Checks whether a data set's output already exists by calling the provided _exists() method.
Method load Loads data by delegation to the provided load method.
Method release Release any cached data.
Method save Saves data by delegation to the provided save method.
Method _copy Undocumented
Method _describe Undocumented
Method _exists Undocumented
Method _load Undocumented
Method _release Undocumented
Method _save Undocumented
Property _logger Undocumented
@classmethod
def from_config(cls: Type, name: str, config: Dict[str, Any], load_version: str = None, save_version: str = None) -> AbstractDataSet: (source)

Create a data set instance using the configuration provided.

Parameters
name: str — Data set name.
config: Dict[str, Any] — Data set config dictionary.
load_version: str — Version string to be used for the load operation if the data set is versioned. Has no effect on the data set if versioning was not enabled.
save_version: str — Version string to be used for the save operation if the data set is versioned. Has no effect on the data set if versioning was not enabled.
Returns
AbstractDataSet — An instance of an AbstractDataSet subclass.
Raises
DataSetError — When the function fails to create the data set from its config.
def __str__(self): (source)

Undocumented

def exists(self) -> bool: (source)

Checks whether a data set's output already exists by calling the provided _exists() method.

Returns
bool — Flag indicating whether the output already exists.
Raises
DataSetError — When the underlying exists method raises an error.
def load(self) -> _DO: (source)

Loads data by delegation to the provided load method.

Returns
_DO — Data returned by the provided load method.
Raises
DataSetError — When the underlying load method raises an error.
def release(self): (source)

Release any cached data.

Raises
DataSetError — When the underlying release method raises an error.
def save(self, data: _DI): (source)

Saves data by delegation to the provided save method.

Parameters
data: _DI — The value to be saved by the provided save method.
Raises
DataSetError — When the underlying save method raises an error.
FileNotFoundError — When the save method got a file instead of a directory, on Windows.
NotADirectoryError — When the save method got a file instead of a directory, on Unix.
def _copy(self, **overwrite_params) -> AbstractDataSet: (source)

Undocumented

@abc.abstractmethod
def _describe(self) -> Dict[str, Any]: (source)
overridden in kedro.extras.datasets.api.APIDataSet, kedro.extras.datasets.biosequence.BioSequenceDataSet, kedro.extras.datasets.dask.ParquetDataSet, kedro.extras.datasets.email.EmailMessageDataSet, kedro.extras.datasets.geopandas.GeoJSONDataSet, kedro.extras.datasets.holoviews.HoloviewsWriter, kedro.extras.datasets.json.JSONDataSet, kedro.extras.datasets.matplotlib.MatplotlibWriter, kedro.extras.datasets.networkx.GMLDataSet, kedro.extras.datasets.networkx.GraphMLDataSet, kedro.extras.datasets.networkx.JSONDataSet, kedro.extras.datasets.pandas.CSVDataSet, kedro.extras.datasets.pandas.ExcelDataSet, kedro.extras.datasets.pandas.FeatherDataSet, kedro.extras.datasets.pandas.GBQQueryDataSet, kedro.extras.datasets.pandas.GBQTableDataSet, kedro.extras.datasets.pandas.GenericDataSet, kedro.extras.datasets.pandas.HDFDataSet, kedro.extras.datasets.pandas.JSONDataSet, kedro.extras.datasets.pandas.ParquetDataSet, kedro.extras.datasets.pandas.sql_dataset.SQLQueryDataSet, kedro.extras.datasets.pandas.sql_dataset.SQLTableDataSet, kedro.extras.datasets.pandas.XMLDataSet, kedro.extras.datasets.pickle.PickleDataSet, kedro.extras.datasets.pillow.ImageDataSet, kedro.extras.datasets.plotly.JSONDataSet, kedro.extras.datasets.redis.PickleDataSet, kedro.extras.datasets.spark.DeltaTableDataSet, kedro.extras.datasets.spark.spark_jdbc_dataset.SparkJDBCDataSet, kedro.extras.datasets.spark.SparkDataSet, kedro.extras.datasets.spark.SparkHiveDataSet, kedro.extras.datasets.svmlight.SVMLightDataSet, kedro.extras.datasets.tensorflow.TensorFlowModelDataset, kedro.extras.datasets.text.TextDataSet, kedro.extras.datasets.video.VideoDataSet, kedro.extras.datasets.yaml.YAMLDataSet, kedro.io.CachedDataSet, kedro.io.LambdaDataSet, kedro.io.MemoryDataSet, kedro.io.PartitionedDataSet

Undocumented

def _exists(self) -> bool: (source)
overridden in kedro.extras.datasets.api.APIDataSet, kedro.extras.datasets.biosequence.BioSequenceDataSet, kedro.extras.datasets.dask.ParquetDataSet, kedro.extras.datasets.email.EmailMessageDataSet, kedro.extras.datasets.geopandas.GeoJSONDataSet, kedro.extras.datasets.holoviews.HoloviewsWriter, kedro.extras.datasets.json.JSONDataSet, kedro.extras.datasets.matplotlib.MatplotlibWriter, kedro.extras.datasets.networkx.GMLDataSet, kedro.extras.datasets.networkx.GraphMLDataSet, kedro.extras.datasets.networkx.JSONDataSet, kedro.extras.datasets.pandas.CSVDataSet, kedro.extras.datasets.pandas.ExcelDataSet, kedro.extras.datasets.pandas.FeatherDataSet, kedro.extras.datasets.pandas.GBQTableDataSet, kedro.extras.datasets.pandas.GenericDataSet, kedro.extras.datasets.pandas.HDFDataSet, kedro.extras.datasets.pandas.JSONDataSet, kedro.extras.datasets.pandas.ParquetDataSet, kedro.extras.datasets.pandas.sql_dataset.SQLTableDataSet, kedro.extras.datasets.pandas.XMLDataSet, kedro.extras.datasets.pickle.PickleDataSet, kedro.extras.datasets.pillow.ImageDataSet, kedro.extras.datasets.plotly.JSONDataSet, kedro.extras.datasets.redis.PickleDataSet, kedro.extras.datasets.spark.DeltaTableDataSet, kedro.extras.datasets.spark.SparkDataSet, kedro.extras.datasets.spark.SparkHiveDataSet, kedro.extras.datasets.svmlight.SVMLightDataSet, kedro.extras.datasets.tensorflow.TensorFlowModelDataset, kedro.extras.datasets.text.TextDataSet, kedro.extras.datasets.video.VideoDataSet, kedro.extras.datasets.yaml.YAMLDataSet, kedro.io.CachedDataSet, kedro.io.LambdaDataSet, kedro.io.MemoryDataSet, kedro.io.PartitionedDataSet

Undocumented

@abc.abstractmethod
def _load(self) -> _DO: (source)
overridden in kedro.extras.datasets.api.APIDataSet, kedro.extras.datasets.biosequence.BioSequenceDataSet, kedro.extras.datasets.dask.ParquetDataSet, kedro.extras.datasets.email.EmailMessageDataSet, kedro.extras.datasets.geopandas.GeoJSONDataSet, kedro.extras.datasets.holoviews.HoloviewsWriter, kedro.extras.datasets.json.JSONDataSet, kedro.extras.datasets.matplotlib.MatplotlibWriter, kedro.extras.datasets.networkx.GMLDataSet, kedro.extras.datasets.networkx.GraphMLDataSet, kedro.extras.datasets.networkx.JSONDataSet, kedro.extras.datasets.pandas.CSVDataSet, kedro.extras.datasets.pandas.ExcelDataSet, kedro.extras.datasets.pandas.FeatherDataSet, kedro.extras.datasets.pandas.GBQQueryDataSet, kedro.extras.datasets.pandas.GBQTableDataSet, kedro.extras.datasets.pandas.GenericDataSet, kedro.extras.datasets.pandas.HDFDataSet, kedro.extras.datasets.pandas.JSONDataSet, kedro.extras.datasets.pandas.ParquetDataSet, kedro.extras.datasets.pandas.sql_dataset.SQLQueryDataSet, kedro.extras.datasets.pandas.sql_dataset.SQLTableDataSet, kedro.extras.datasets.pandas.XMLDataSet, kedro.extras.datasets.pickle.PickleDataSet, kedro.extras.datasets.pillow.ImageDataSet, kedro.extras.datasets.plotly.JSONDataSet, kedro.extras.datasets.redis.PickleDataSet, kedro.extras.datasets.spark.DeltaTableDataSet, kedro.extras.datasets.spark.spark_jdbc_dataset.SparkJDBCDataSet, kedro.extras.datasets.spark.SparkDataSet, kedro.extras.datasets.spark.SparkHiveDataSet, kedro.extras.datasets.svmlight.SVMLightDataSet, kedro.extras.datasets.tensorflow.TensorFlowModelDataset, kedro.extras.datasets.text.TextDataSet, kedro.extras.datasets.video.VideoDataSet, kedro.extras.datasets.yaml.YAMLDataSet, kedro.io.CachedDataSet, kedro.io.LambdaDataSet, kedro.io.MemoryDataSet, kedro.io.PartitionedDataSet

Undocumented

@abc.abstractmethod
def _save(self, data: _DI): (source)
overridden in kedro.extras.datasets.api.APIDataSet, kedro.extras.datasets.biosequence.BioSequenceDataSet, kedro.extras.datasets.dask.ParquetDataSet, kedro.extras.datasets.email.EmailMessageDataSet, kedro.extras.datasets.geopandas.GeoJSONDataSet, kedro.extras.datasets.holoviews.HoloviewsWriter, kedro.extras.datasets.json.JSONDataSet, kedro.extras.datasets.matplotlib.MatplotlibWriter, kedro.extras.datasets.networkx.GMLDataSet, kedro.extras.datasets.networkx.GraphMLDataSet, kedro.extras.datasets.networkx.JSONDataSet, kedro.extras.datasets.pandas.CSVDataSet, kedro.extras.datasets.pandas.ExcelDataSet, kedro.extras.datasets.pandas.FeatherDataSet, kedro.extras.datasets.pandas.GBQQueryDataSet, kedro.extras.datasets.pandas.GBQTableDataSet, kedro.extras.datasets.pandas.GenericDataSet, kedro.extras.datasets.pandas.HDFDataSet, kedro.extras.datasets.pandas.JSONDataSet, kedro.extras.datasets.pandas.ParquetDataSet, kedro.extras.datasets.pandas.sql_dataset.SQLQueryDataSet, kedro.extras.datasets.pandas.sql_dataset.SQLTableDataSet, kedro.extras.datasets.pandas.XMLDataSet, kedro.extras.datasets.pickle.PickleDataSet, kedro.extras.datasets.pillow.ImageDataSet, kedro.extras.datasets.plotly.JSONDataSet, kedro.extras.datasets.redis.PickleDataSet, kedro.extras.datasets.spark.DeltaTableDataSet, kedro.extras.datasets.spark.spark_jdbc_dataset.SparkJDBCDataSet, kedro.extras.datasets.spark.SparkDataSet, kedro.extras.datasets.spark.SparkHiveDataSet, kedro.extras.datasets.svmlight.SVMLightDataSet, kedro.extras.datasets.tensorflow.TensorFlowModelDataset, kedro.extras.datasets.text.TextDataSet, kedro.extras.datasets.video.VideoDataSet, kedro.extras.datasets.yaml.YAMLDataSet, kedro.io.CachedDataSet, kedro.io.LambdaDataSet, kedro.io.MemoryDataSet, kedro.io.PartitionedDataSet

Undocumented