Catalog

smallcat.catalog.Catalog

Bases: BaseModel

A collection of named datasets with associated loader configuration.

The catalog maps user-defined keys to concrete dataset entries (e.g., CSV or Excel). It can be constructed from an in-memory dictionary, an Airflow Variable (JSON), or a YAML file.

Attributes:

    entries (dict[str, CatalogEntry]): Mapping of dataset names to their configurations.
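
For example, a catalog can be created in memory. This is a minimal sketch: the entry name and location are illustrative, and both option blocks are left unset.

from smallcat.catalog import Catalog

catalog = Catalog.from_dict(
    {
        "entries": {
            "sales": {
                "file_format": "csv",
                "location": "data/sales.csv",  # illustrative path
                "load_options": None,
                "save_options": None,
            }
        }
    }
)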

Source code in src/smallcat/catalog.py
class Catalog(BaseModel):
    """A collection of named datasets with associated loader configuration.

    The catalog maps user-defined keys to concrete dataset entries (e.g., CSV or
    Excel). It can be constructed from an in-memory dictionary, an Airflow
    Variable (JSON), or a YAML file.

    Attributes:
        entries: Mapping of dataset names to their configurations.
    """

    entries: dict[str, CatalogEntry] = Field(
        ...,
        description="Named data sets",
    )

    @staticmethod
    def from_dict(dictionary: dict) -> "Catalog":
        """Create a catalog from a Python dictionary.

        The dictionary must conform to the `Catalog` schema (i.e., include an
        `entries` key mapping names to valid `CatalogEntry` objects).

        Args:
            dictionary: A dictionary matching the `Catalog` model.

        Returns:
            Catalog: A validated `Catalog` instance.

        Raises:
            pydantic.ValidationError: If the dictionary does not match the schema.
        """
        return Catalog.model_validate(dictionary)

    @staticmethod
    def from_airflow_variable(variable_id: str) -> "Catalog":
        """Create a catalog from an Airflow Variable containing JSON.

        The Airflow Variable should contain a JSON object compatible with the
        `Catalog` schema.

        Args:
            variable_id: The Airflow Variable ID to read (expects JSON).

        Returns:
            Catalog: A `Catalog` instance constructed from the Variable value.

        Raises:
            KeyError: If the Airflow Variable does not exist.
            pydantic.ValidationError: If the JSON payload is invalid for the model.
        """
        try:
            from airflow.sdk import Variable
        except ImportError:
            from airflow.models import Variable  # type: ignore[attr-defined,no-redef] # noqa: I001

        try:
            dictionary_entries = Variable.get(variable_id, deserialize_json=True)
        except TypeError:
            # LocalFilesystemBackend can return an object causing a TypeError
            # In this case we don't need to deserialize into JSON
            #  as it's not a string
            dictionary_entries = Variable.get(variable_id)
        except ImportError as e:
            # Airflow fails with import error if variable is not present and tries
            #  to talk to the Task Supervisor (the runner process) over an internal
            #  comms channel (SUPERVISOR_COMMS) to fetch it.
            msg = f"Variable {variable_id} not found in Airflow"
            raise KeyError(msg) from e
        return Catalog.from_dict(dictionary_entries)

    @staticmethod
    def from_yaml(dictionary_path: str | Path) -> "Catalog":
        """Create a catalog from a YAML file.

        Args:
            dictionary_path: Path to a YAML file whose content matches the
                `Catalog` schema.

        Returns:
            Catalog: A `Catalog` instance constructed from the YAML content.

        Raises:
            FileNotFoundError: If the YAML file cannot be found.
            pydantic.ValidationError: If the YAML content is invalid for the model.
        """
        with Path(dictionary_path).open() as f:
            catalog_dict = yaml.safe_load(f)
        return Catalog.from_dict(catalog_dict)

    def _get_entry(self, key: str) -> EntryBase:
        try:
            return self.entries[key]
        except KeyError as e:
            msg = f"Entry {key} not found in dictionary"
            raise KeyError(msg) from e

    def get_dataset(self, key: str) -> BaseDataset:
        """Instantiate a concrete dataset for a given catalog entry.

        Args:
            key: The name of the catalog entry to resolve.

        Returns:
            BaseDataset: A dataset instance ready to load/save the data.

        Raises:
            KeyError: If the key is not present in the catalog.
            ValueError: If the entry's `file_format` is not supported.
        """
        entry = self._get_entry(key)
        return entry.build_dataset()

    def load_pandas(self, key: str, where: str | None = None) -> "pd.DataFrame":
        """Load a dataset from the catalog into a pandas DataFrame.

        Resolves the catalog entry identified by ``key`` and delegates to
        :meth:`EntryBase.load_pandas`. This is equivalent to:

            ``self.entries[key].build_dataset().load_pandas(entry.location)``

        Args:
            key: The catalog entry name to load.
            where: Optional SQL filter predicate forwarded to the dataset.

        Returns:
            pd.DataFrame: The loaded tabular data.

        Raises:
            KeyError: If ``key`` is not present in the catalog.
            Exception: Any error propagated from the underlying dataset's loader.
        """
        entry = self._get_entry(key)
        return entry.load_pandas(where=where)

    def save_pandas(self, key: str, df: "pd.DataFrame") -> None:
        """Save a pandas DataFrame to a dataset in the catalog.

        Resolves the catalog entry identified by ``key`` and delegates to
        :meth:`EntryBase.save_pandas`. This writes to the entry's configured
        ``location`` with any format-specific save options applied.

        Args:
            key: The catalog entry name to write to.
            df (pd.DataFrame): The DataFrame to persist.

        Raises:
            KeyError: If ``key`` is not present in the catalog.
            Exception: Any error propagated from the underlying dataset's saver.
        """
        entry = self._get_entry(key)
        entry.save_pandas(df)

    def load_arrow(self, key: str, where: str | None = None) -> "pa.Table":
        """Load a dataset from the catalog into an Apache Arrow Table.

        Resolves the catalog entry identified by `key` and delegates to
        :meth:`EntryBase.load_arrow`. This is equivalent to:

            `self.entries[key].build_dataset().load_arrow_table(entry.location)`

        Args:
            key: The catalog entry name to load.
            where: Optional SQL filter predicate forwarded to the dataset.

        Returns:
            pa.Table: The loaded Arrow table.

        Raises:
            KeyError: If `key` is not present in the catalog.
            Exception: Any error propagated from the underlying dataset's loader.
        """
        entry = self._get_entry(key)
        return entry.load_arrow(where=where)

    def save_arrow(self, key: str, table: "pa.Table") -> None:
        """Save an Apache Arrow Table to a dataset in the catalog.

        Resolves the catalog entry identified by `key` and delegates to
        :meth:`EntryBase.save_arrow`. This writes to the entry's configured
        `location` with any format-specific save options applied.

        Args:
            key: The catalog entry name to write to.
            table (pa.Table): The Arrow table to persist.

        Raises:
            KeyError: If `key` is not present in the catalog.
            Exception: Any error propagated from the underlying dataset's saver.
        """
        entry = self._get_entry(key)
        entry.save_arrow(table)

entries class-attribute instance-attribute

entries: dict[str, CatalogEntry] = Field(..., description='Named data sets')

from_dict staticmethod

from_dict(dictionary: dict) -> Catalog

Create a catalog from a Python dictionary.

The dictionary must conform to the Catalog schema (i.e., include an entries key mapping names to valid CatalogEntry objects).

Parameters:

    dictionary (dict): A dictionary matching the Catalog model. Required.

Returns:

    Catalog: A validated Catalog instance.

Raises:

    pydantic.ValidationError: If the dictionary does not match the schema.

Source code in src/smallcat/catalog.py
@staticmethod
def from_dict(dictionary: dict) -> "Catalog":
    """Create a catalog from a Python dictionary.

    The dictionary must conform to the `Catalog` schema (i.e., include an
    `entries` key mapping names to valid `CatalogEntry` objects).

    Args:
        dictionary: A dictionary matching the `Catalog` model.

    Returns:
        Catalog: A validated `Catalog` instance.

    Raises:
        pydantic.ValidationError: If the dictionary does not match the schema.
    """
    return Catalog.model_validate(dictionary)
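
A sketch of the failure mode (pydantic v2 API assumed): an invalid mapping raises pydantic.ValidationError.

import pydantic

try:
    Catalog.from_dict({"entries": {"bad": {"file_format": "unknown"}}})
except pydantic.ValidationError as exc:
    print(exc.error_count(), "validation error(s)")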

from_airflow_variable staticmethod

from_airflow_variable(variable_id: str) -> Catalog

Create a catalog from an Airflow Variable containing JSON.

The Airflow Variable should contain a JSON object compatible with the Catalog schema.

Parameters:

    variable_id (str): The Airflow Variable ID to read (expects JSON). Required.

Returns:

    Catalog: A Catalog instance constructed from the Variable value.

Raises:

    KeyError: If the Airflow Variable does not exist.
    pydantic.ValidationError: If the JSON payload is invalid for the model.

Source code in src/smallcat/catalog.py
@staticmethod
def from_airflow_variable(variable_id: str) -> "Catalog":
    """Create a catalog from an Airflow Variable containing JSON.

    The Airflow Variable should contain a JSON object compatible with the
    `Catalog` schema.

    Args:
        variable_id: The Airflow Variable ID to read (expects JSON).

    Returns:
        Catalog: A `Catalog` instance constructed from the Variable value.

    Raises:
        KeyError: If the Airflow Variable does not exist.
        pydantic.ValidationError: If the JSON payload is invalid for the model.
    """
    try:
        from airflow.sdk import Variable
    except ImportError:
        from airflow.models import Variable  # type: ignore[attr-defined,no-redef] # noqa: I001

    try:
        dictionary_entries = Variable.get(variable_id, deserialize_json=True)
    except TypeError:
        # LocalFilesystemBackend can return an object causing a TypeError
        # In this case we don't need to deserialize into JSON
        #  as it's not a string
        dictionary_entries = Variable.get(variable_id)
    except ImportError as e:
        # Airflow fails with import error if variable is not present and tries
        #  to talk to the Task Supervisor (the runner process) over an internal
        #  comms channel (SUPERVISOR_COMMS) to fetch it.
        msg = f"Variable {variable_id} not found in Airflow"
        raise KeyError(msg) from e
    return Catalog.from_dict(dictionary_entries)
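
A sketch assuming an Airflow Variable named smallcat_catalog (the name is illustrative) holds the catalog as JSON:

# Set beforehand, e.g.:
#   airflow variables set smallcat_catalog '{"entries": {...}}'
catalog = Catalog.from_airflow_variable("smallcat_catalog")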

from_yaml staticmethod

from_yaml(dictionary_path: str | Path) -> Catalog

Create a catalog from a YAML file.

Parameters:

    dictionary_path (str | Path): Path to a YAML file whose content matches the Catalog schema. Required.

Returns:

    Catalog: A Catalog instance constructed from the YAML content.

Raises:

    FileNotFoundError: If the YAML file cannot be found.
    pydantic.ValidationError: If the YAML content is invalid for the model.

Source code in src/smallcat/catalog.py
@staticmethod
def from_yaml(dictionary_path: str | Path) -> "Catalog":
    """Create a catalog from a YAML file.

    Args:
        dictionary_path: Path to a YAML file whose content matches the
            `Catalog` schema.

    Returns:
        Catalog: A `Catalog` instance constructed from the YAML content.

    Raises:
        FileNotFoundError: If the YAML file cannot be found.
        pydantic.ValidationError: If the YAML content is invalid for the model.
    """
    with Path(dictionary_path).open() as f:
        catalog_dict = yaml.safe_load(f)
    return Catalog.from_dict(catalog_dict)
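
A self-contained sketch; the file name and entry values are illustrative:

from pathlib import Path

from smallcat.catalog import Catalog

Path("catalog.yaml").write_text(
    """
entries:
  sales:
    file_format: csv
    location: data/sales.csv
    load_options: null
    save_options: null
"""
)
catalog = Catalog.from_yaml("catalog.yaml")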

_get_entry

_get_entry(key: str) -> EntryBase
Source code in src/smallcat/catalog.py
def _get_entry(self, key: str) -> EntryBase:
    try:
        return self.entries[key]
    except KeyError as e:
        msg = f"Entry {key} not found in dictionary"
        raise KeyError(msg) from e

get_dataset

get_dataset(key: str) -> BaseDataset

Instantiate a concrete dataset for a given catalog entry.

Parameters:

    key (str): The name of the catalog entry to resolve. Required.

Returns:

    BaseDataset: A dataset instance ready to load/save the data.

Raises:

    KeyError: If the key is not present in the catalog.
    ValueError: If the entry's file_format is not supported.

Source code in src/smallcat/catalog.py
def get_dataset(self, key: str) -> BaseDataset:
    """Instantiate a concrete dataset for a given catalog entry.

    Args:
        key: The name of the catalog entry to resolve.

    Returns:
        BaseDataset: A dataset instance ready to load/save the data.

    Raises:
        KeyError: If the key is not present in the catalog.
        ValueError: If the entry's `file_format` is not supported.
    """
    entry = self._get_entry(key)
    return entry.build_dataset()
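
A sketch, reusing the catalog built in the earlier examples:

dataset = catalog.get_dataset("sales")  # a CSVDataset, since the entry is csv
# The dataset is bound to the entry's connection and options; the wrappers
# below (load_pandas, save_pandas, ...) supply the entry's location for you.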

load_pandas

load_pandas(key: str, where: str | None = None) -> pd.DataFrame

Load a dataset from the catalog into a pandas DataFrame.

Resolves the catalog entry identified by key and delegates to EntryBase.load_pandas. This is equivalent to:

    self.entries[key].build_dataset().load_pandas(entry.location)

Parameters:

    key (str): The catalog entry name to load. Required.
    where (str | None): Optional SQL filter predicate forwarded to the dataset. Defaults to None.

Returns:

    pd.DataFrame: The loaded tabular data.

Raises:

    KeyError: If key is not present in the catalog.
    Exception: Any error propagated from the underlying dataset's loader.

Source code in src/smallcat/catalog.py
def load_pandas(self, key: str, where: str | None = None) -> "pd.DataFrame":
    """Load a dataset from the catalog into a pandas DataFrame.

    Resolves the catalog entry identified by ``key`` and delegates to
    :meth:`EntryBase.load_pandas`. This is equivalent to:

        ``self.entries[key].build_dataset().load_pandas(entry.location)``

    Args:
        key: The catalog entry name to load.
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        KeyError: If ``key`` is not present in the catalog.
        Exception: Any error propagated from the underlying dataset's loader.
    """
    entry = self._get_entry(key)
    return entry.load_pandas(where=where)
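
A usage sketch; the where predicate refers to an assumed year column:

df = catalog.load_pandas("sales")                               # whole dataset
df_recent = catalog.load_pandas("sales", where="year >= 2024")  # filtered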

save_pandas

save_pandas(key: str, df: pd.DataFrame) -> None

Save a pandas DataFrame to a dataset in the catalog.

Resolves the catalog entry identified by key and delegates to EntryBase.save_pandas. This writes to the entry's configured location with any format-specific save options applied.

Parameters:

    key (str): The catalog entry name to write to. Required.
    df (pd.DataFrame): The DataFrame to persist. Required.

Raises:

    KeyError: If key is not present in the catalog.
    Exception: Any error propagated from the underlying dataset's saver.

Source code in src/smallcat/catalog.py
def save_pandas(self, key: str, df: "pd.DataFrame") -> None:
    """Save a pandas DataFrame to a dataset in the catalog.

    Resolves the catalog entry identified by ``key`` and delegates to
    :meth:`EntryBase.save_pandas`. This writes to the entry's configured
    ``location`` with any format-specific save options applied.

    Args:
        key: The catalog entry name to write to.
        df (pd.DataFrame): The DataFrame to persist.

    Raises:
        KeyError: If ``key`` is not present in the catalog.
        Exception: Any error propagated from the underlying dataset's saver.
    """
    entry = self._get_entry(key)
    entry.save_pandas(df)
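
A round-trip sketch with an illustrative frame:

import pandas as pd

df = pd.DataFrame({"year": [2024, 2025], "amount": [10.5, 12.0]})
catalog.save_pandas("sales", df)
assert len(catalog.load_pandas("sales")) == 2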

load_arrow

load_arrow(key: str, where: str | None = None) -> pa.Table

Load a dataset from the catalog into an Apache Arrow Table.

Resolves the catalog entry identified by key and delegates to EntryBase.load_arrow. This is equivalent to:

    self.entries[key].build_dataset().load_arrow_table(entry.location)

Parameters:

    key (str): The catalog entry name to load. Required.
    where (str | None): Optional SQL filter predicate forwarded to the dataset. Defaults to None.

Returns:

    pa.Table: The loaded Arrow table.

Raises:

    KeyError: If key is not present in the catalog.
    Exception: Any error propagated from the underlying dataset's loader.

Source code in src/smallcat/catalog.py
def load_arrow(self, key: str, where: str | None = None) -> "pa.Table":
    """Load a dataset from the catalog into an Apache Arrow Table.

    Resolves the catalog entry identified by `key` and delegates to
    :meth:`EntryBase.load_arrow`. This is equivalent to:

        `self.entries[key].build_dataset().load_arrow_table(entry.location)`

    Args:
        key: The catalog entry name to load.
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pa.Table: The loaded Arrow table.

    Raises:
        KeyError: If `key` is not present in the catalog.
        Exception: Any error propagated from the underlying dataset's loader.
    """
    entry = self._get_entry(key)
    return entry.load_arrow(where=where)

save_arrow

save_arrow(key: str, table: pa.Table) -> None

Save an Apache Arrow Table to a dataset in the catalog.

Resolves the catalog entry identified by key and delegates to EntryBase.save_arrow. This writes to the entry's configured location with any format-specific save options applied.

Parameters:

    key (str): The catalog entry name to write to. Required.
    table (pa.Table): The Arrow table to persist. Required.

Raises:

    KeyError: If key is not present in the catalog.
    Exception: Any error propagated from the underlying dataset's saver.

Source code in src/smallcat/catalog.py
def save_arrow(self, key: str, table: "pa.Table") -> None:
    """Save an Apache Arrow Table to a dataset in the catalog.

    Resolves the catalog entry identified by `key` and delegates to
    :meth:`EntryBase.save_arrow`. This writes to the entry's configured
    `location` with any format-specific save options applied.

    Args:
        key: The catalog entry name to write to.
        table (pa.Table): The Arrow table to persist.

    Raises:
        KeyError: If `key` is not present in the catalog.
        Exception: Any error propagated from the underlying dataset's saver.
    """
    entry = self._get_entry(key)
    entry.save_arrow(table)
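
An Arrow round-trip sketch mirroring the pandas helpers (columns are illustrative):

import pyarrow as pa

table = pa.table({"year": [2024, 2025], "amount": [10.5, 12.0]})
catalog.save_arrow("sales", table)
recent = catalog.load_arrow("sales", where="year = 2025")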

Entries

smallcat.catalog.CSVEntry

Bases: EntryBase

Catalog entry describing a CSV dataset.

Attributes:

    file_format (Literal['csv']): Literal string identifying the file format: 'csv'.
    load_options (CSVLoadOptions | None): Options controlling CSV reading (see CSVLoadOptions).
    save_options (CSVSaveOptions | None): Options controlling CSV writing (see CSVSaveOptions).

Source code in src/smallcat/catalog.py
class CSVEntry(EntryBase):
    """Catalog entry describing a CSV dataset.

    Attributes:
        file_format: Literal string identifying the file format: `'csv'`.
        load_options: Options controlling CSV *reading* (see `CSVLoadOptions`).
        save_options: Options controlling CSV *writing* (see `CSVSaveOptions`).
    """

    file_format: Literal["csv"] = "csv"
    load_options: CSVLoadOptions | None
    save_options: CSVSaveOptions | None

    def build_dataset(self) -> CSVDataset:
        """Build a :class:`CSVDataset` using this entry's configuration.

        Returns:
            CSVDataset: A dataset configured with the resolved connection and options.
        """
        return CSVDataset(
            conn=self.get_connection(),
            load_options=self.load_options,
            save_options=self.save_options,
        )
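
A construction sketch; it assumes location is the only other field the entry needs (connection settings, if any, are omitted):

from smallcat.catalog import CSVEntry

entry = CSVEntry(
    location="data/sales.csv",  # illustrative path
    load_options=None,
    save_options=None,
)
df = entry.load_pandas()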

file_format class-attribute instance-attribute

file_format: Literal['csv'] = 'csv'

load_options instance-attribute

load_options: CSVLoadOptions | None

save_options instance-attribute

save_options: CSVSaveOptions | None

build_dataset

build_dataset() -> CSVDataset

Build a CSVDataset using this entry's configuration.

Returns:

    CSVDataset: A dataset configured with the resolved connection and options.

Source code in src/smallcat/catalog.py
def build_dataset(self) -> CSVDataset:
    """Build a :class:`CSVDataset` using this entry's configuration.

    Returns:
        CSVDataset: A dataset configured with the resolved connection and options.
    """
    return CSVDataset(
        conn=self.get_connection(),
        load_options=self.load_options,
        save_options=self.save_options,
    )

load_pandas

load_pandas(where: str | None = None) -> pd.DataFrame

Load this entry's dataset into a pandas DataFrame.

This method builds the concrete dataset via build_dataset and delegates to its load_pandas method using this entry's location. Any dataset-specific load options configured on the entry are respected.

Parameters:

    where (str | None): Optional SQL filter predicate forwarded to the dataset. Defaults to None.

Returns:

    pd.DataFrame: The loaded tabular data.

Raises:

    FileNotFoundError: If the target path/table at location does not exist.
    ValueError: If the data cannot be parsed as tabular data.
    Exception: Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py
def load_pandas(self, where: str | None = None) -> "pd.DataFrame":
    """Load this entry's dataset into a pandas DataFrame.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``load_pandas`` method using this entry's ``location``.
    Any dataset-specific load options configured on the entry are respected.

    Args:
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        FileNotFoundError: If the target path/table at ``location`` does not exist.
        ValueError: If the data cannot be parsed as tabular data.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    return self.build_dataset().load_pandas(self.location, where=where)

save_pandas

save_pandas(df: pd.DataFrame) -> None

Save a pandas DataFrame to this entry's dataset location.

This method builds the concrete dataset via build_dataset and delegates to its save_pandas method using this entry's location. Any dataset-specific save options configured on the entry are respected.

Parameters:

    df (pd.DataFrame): The DataFrame to persist. Required.

Raises:

    PermissionError: If the target cannot be written to.
    ValueError: If the DataFrame is incompatible with the target format/options.
    Exception: Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py
def save_pandas(self, df: "pd.DataFrame") -> None:
    """Save a pandas DataFrame to this entry's dataset location.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``save_pandas`` method using this entry's ``location``.
    Any dataset-specific save options configured on the entry are respected.

    Args:
        df (pd.DataFrame): The DataFrame to persist.

    Raises:
        PermissionError: If the target cannot be written to.
        ValueError: If the DataFrame is incompatible with the target format/options.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    self.build_dataset().save_pandas(self.location, df)

smallcat.catalog.ExcelEntry

Bases: EntryBase

Catalog entry describing an Excel dataset.

Attributes:

    file_format (Literal['excel']): Literal string identifying the file format: 'excel'.
    load_options (ExcelLoadOptions | None): Options controlling Excel reading (see ExcelLoadOptions).
    save_options (ExcelSaveOptions | None): Options controlling Excel writing (see ExcelSaveOptions).

Source code in src/smallcat/catalog.py
class ExcelEntry(EntryBase):
    """Catalog entry describing an Excel dataset.

    Attributes:
        file_format: Literal string identifying the file format: `'excel'`.
        load_options: Options controlling Excel *reading* (see `ExcelLoadOptions`).
        save_options: Options controlling Excel *writing* (see `ExcelSaveOptions`).
    """

    file_format: Literal["excel"] = "excel"
    load_options: ExcelLoadOptions | None
    save_options: ExcelSaveOptions | None

    def build_dataset(self) -> ExcelDataset:
        """Build an :class:`ExcelDataset` using this entry's configuration.

        Returns:
            ExcelDataset: A dataset configured with the resolved connection and options.
        """
        return ExcelDataset(
            conn=self.get_connection(),
            load_options=self.load_options,
            save_options=self.save_options,
        )

file_format class-attribute instance-attribute

file_format: Literal['excel'] = 'excel'

load_options instance-attribute

load_options: ExcelLoadOptions | None

save_options instance-attribute

save_options: ExcelSaveOptions | None

build_dataset

build_dataset() -> ExcelDataset

Build an ExcelDataset using this entry's configuration.

Returns:

    ExcelDataset: A dataset configured with the resolved connection and options.

Source code in src/smallcat/catalog.py
def build_dataset(self) -> ExcelDataset:
    """Build an :class:`ExcelDataset` using this entry's configuration.

    Returns:
        ExcelDataset: A dataset configured with the resolved connection and options.
    """
    return ExcelDataset(
        conn=self.get_connection(),
        load_options=self.load_options,
        save_options=self.save_options,
    )

load_pandas

load_pandas(where: str | None = None) -> pd.DataFrame

Load this entry's dataset into a pandas DataFrame.

This method builds the concrete dataset via build_dataset and delegates to its load_pandas method using this entry's location. Any dataset-specific load options configured on the entry are respected.

Parameters:

    where (str | None): Optional SQL filter predicate forwarded to the dataset. Defaults to None.

Returns:

    pd.DataFrame: The loaded tabular data.

Raises:

    FileNotFoundError: If the target path/table at location does not exist.
    ValueError: If the data cannot be parsed as tabular data.
    Exception: Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py
def load_pandas(self, where: str | None = None) -> "pd.DataFrame":
    """Load this entry's dataset into a pandas DataFrame.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``load_pandas`` method using this entry's ``location``.
    Any dataset-specific load options configured on the entry are respected.

    Args:
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        FileNotFoundError: If the target path/table at ``location`` does not exist.
        ValueError: If the data cannot be parsed as tabular data.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    return self.build_dataset().load_pandas(self.location, where=where)

save_pandas

save_pandas(df: pd.DataFrame) -> None

Save a pandas DataFrame to this entry's dataset location.

This method builds the concrete dataset via build_dataset and delegates to its save_pandas method using this entry's location. Any dataset-specific save options configured on the entry are respected.

Parameters:

    df (pd.DataFrame): The DataFrame to persist. Required.

Raises:

    PermissionError: If the target cannot be written to.
    ValueError: If the DataFrame is incompatible with the target format/options.
    Exception: Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py
def save_pandas(self, df: "pd.DataFrame") -> None:
    """Save a pandas DataFrame to this entry's dataset location.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``save_pandas`` method using this entry's ``location``.
    Any dataset-specific save options configured on the entry are respected.

    Args:
        df (pd.DataFrame): The DataFrame to persist.

    Raises:
        PermissionError: If the target cannot be written to.
        ValueError: If the DataFrame is incompatible with the target format/options.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    self.build_dataset().save_pandas(self.location, df)

smallcat.catalog.ParquetEntry

Bases: EntryBase

Catalog entry describing a Parquet dataset.

Attributes:

    file_format (Literal['parquet']): Literal string identifying the file format: 'parquet'.
    load_options (ParquetLoadOptions | None): Optional configuration controlling Parquet reading behavior (see ParquetLoadOptions).
    save_options (ParquetSaveOptions | None): Optional configuration controlling Parquet writing behavior (see ParquetSaveOptions).

Source code in src/smallcat/catalog.py
class ParquetEntry(EntryBase):
    """Catalog entry describing a Parquet dataset.

    Attributes:
        file_format: Literal string identifying the file format: `'parquet'`.
        load_options: Optional configuration controlling Parquet *reading*
            behavior (see :class:`ParquetLoadOptions`).
        save_options: Optional configuration controlling Parquet *writing*
            behavior (see :class:`ParquetSaveOptions`).
    """

    file_format: Literal["parquet"] = "parquet"
    load_options: ParquetLoadOptions | None
    save_options: ParquetSaveOptions | None

    def build_dataset(self) -> ParquetDataset:
        """Build a :class:`ParquetDataset` using this entry's configuration.

        Returns:
            ParquetDataset: A dataset configured with the resolved connection
            and Parquet-specific options.
        """
        return ParquetDataset(
            conn=self.get_connection(),
            load_options=self.load_options,
            save_options=self.save_options,
        )

file_format class-attribute instance-attribute

file_format: Literal['parquet'] = 'parquet'

load_options instance-attribute

load_options: ParquetLoadOptions | None

save_options instance-attribute

save_options: ParquetSaveOptions | None

build_dataset

build_dataset() -> ParquetDataset

Build a ParquetDataset using this entry's configuration.

Returns:

    ParquetDataset: A dataset configured with the resolved connection and Parquet-specific options.

Source code in src/smallcat/catalog.py
def build_dataset(self) -> ParquetDataset:
    """Build a :class:`ParquetDataset` using this entry's configuration.

    Returns:
        ParquetDataset: A dataset configured with the resolved connection
        and Parquet-specific options.
    """
    return ParquetDataset(
        conn=self.get_connection(),
        load_options=self.load_options,
        save_options=self.save_options,
    )

load_pandas

load_pandas(where: str | None = None) -> pd.DataFrame

Load this entry's dataset into a pandas DataFrame.

This method builds the concrete dataset via build_dataset and delegates to its load_pandas method using this entry's location. Any dataset-specific load options configured on the entry are respected.

Parameters:

    where (str | None): Optional SQL filter predicate forwarded to the dataset. Defaults to None.

Returns:

    pd.DataFrame: The loaded tabular data.

Raises:

    FileNotFoundError: If the target path/table at location does not exist.
    ValueError: If the data cannot be parsed as tabular data.
    Exception: Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py
def load_pandas(self, where: str | None = None) -> "pd.DataFrame":
    """Load this entry's dataset into a pandas DataFrame.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``load_pandas`` method using this entry's ``location``.
    Any dataset-specific load options configured on the entry are respected.

    Args:
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        FileNotFoundError: If the target path/table at ``location`` does not exist.
        ValueError: If the data cannot be parsed as tabular data.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    return self.build_dataset().load_pandas(self.location, where=where)

save_pandas

save_pandas(df: pd.DataFrame) -> None

Save a pandas DataFrame to this entry's dataset location.

This method builds the concrete dataset via build_dataset and delegates to its save_pandas method using this entry's location. Any dataset-specific save options configured on the entry are respected.

Parameters:

    df (pd.DataFrame): The DataFrame to persist. Required.

Raises:

    PermissionError: If the target cannot be written to.
    ValueError: If the DataFrame is incompatible with the target format/options.
    Exception: Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py
def save_pandas(self, df: "pd.DataFrame") -> None:
    """Save a pandas DataFrame to this entry's dataset location.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``save_pandas`` method using this entry's ``location``.
    Any dataset-specific save options configured on the entry are respected.

    Args:
        df (pd.DataFrame): The DataFrame to persist.

    Raises:
        PermissionError: If the target cannot be written to.
        ValueError: If the DataFrame is incompatible with the target format/options.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    self.build_dataset().save_pandas(self.location, df)

smallcat.catalog.DeltaTableEntry

Bases: EntryBase

Catalog entry describing a Delta Lake table dataset.

This entry specifies configuration for reading from or writing to Delta Lake tables, typically stored on local or cloud-backed storage. It includes both connection details and Delta-specific load/save options.

Attributes:

    file_format (Literal['delta_table']): Literal string identifying the file format: 'delta_table'.
    load_options (DeltaTableLoadOptions | None): Optional configuration controlling Delta table reading behavior (see DeltaTableLoadOptions).
    save_options (DeltaTableSaveOptions | None): Optional configuration controlling Delta table writing behavior (see DeltaTableSaveOptions).

Source code in src/smallcat/catalog.py
class DeltaTableEntry(EntryBase):
    """Catalog entry describing a Delta Lake table dataset.

    This entry specifies configuration for reading from or writing to Delta
    Lake tables, typically stored on local or cloud-backed storage. It includes
    both connection details and Delta-specific load/save options.

    Attributes:
        file_format: Literal string identifying the file format: `'delta_table'`.
        load_options: Optional configuration controlling Delta table *reading*
            behavior (see :class:`DeltaTableLoadOptions`).
        save_options: Optional configuration controlling Delta table *writing*
            behavior (see :class:`DeltaTableSaveOptions`).
    """

    file_format: Literal["delta_table"] = "delta_table"
    load_options: DeltaTableLoadOptions | None
    save_options: DeltaTableSaveOptions | None

    def build_dataset(self) -> DeltaTableDataset:
        """Build a :class:`DeltaTableDataset` using this entry's configuration.

        Returns:
            DeltaTableDataset: A dataset configured with the resolved connection
            and Delta Lake options.
        """
        return DeltaTableDataset(
            conn=self.get_connection(),
            load_options=self.load_options,
            save_options=self.save_options,
        )
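
A configuration sketch; the table URI is illustrative, and any storage credentials would come from the entry's connection:

from smallcat.catalog import Catalog

catalog = Catalog.from_dict(
    {
        "entries": {
            "events": {
                "file_format": "delta_table",
                "location": "s3://bucket/events",  # illustrative table URI
                "load_options": None,
                "save_options": None,
            }
        }
    }
)
dataset = catalog.get_dataset("events")  # a DeltaTableDataset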

file_format class-attribute instance-attribute

file_format: Literal['delta_table'] = 'delta_table'

load_options instance-attribute

load_options: DeltaTableLoadOptions | None

save_options instance-attribute

save_options: DeltaTableSaveOptions | None

build_dataset

build_dataset() -> DeltaTableDataset

Build a DeltaTableDataset using this entry's configuration.

Returns:

    DeltaTableDataset: A dataset configured with the resolved connection and Delta Lake options.

Source code in src/smallcat/catalog.py
def build_dataset(self) -> DeltaTableDataset:
    """Build a :class:`DeltaTableDataset` using this entry's configuration.

    Returns:
        DeltaTableDataset: A dataset configured with the resolved connection
        and Delta Lake options.
    """
    return DeltaTableDataset(
        conn=self.get_connection(),
        load_options=self.load_options,
        save_options=self.save_options,
    )

load_pandas

load_pandas(where: str | None = None) -> pd.DataFrame

Load this entry's dataset into a pandas DataFrame.

This method builds the concrete dataset via build_dataset and delegates to its load_pandas method using this entry's location. Any dataset-specific load options configured on the entry are respected.

Parameters:

    where (str | None): Optional SQL filter predicate forwarded to the dataset. Defaults to None.

Returns:

    pd.DataFrame: The loaded tabular data.

Raises:

    FileNotFoundError: If the target path/table at location does not exist.
    ValueError: If the data cannot be parsed as tabular data.
    Exception: Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py
def load_pandas(self, where: str | None = None) -> "pd.DataFrame":
    """Load this entry's dataset into a pandas DataFrame.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``load_pandas`` method using this entry's ``location``.
    Any dataset-specific load options configured on the entry are respected.

    Args:
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        FileNotFoundError: If the target path/table at ``location`` does not exist.
        ValueError: If the data cannot be parsed as tabular data.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    return self.build_dataset().load_pandas(self.location, where=where)

save_pandas

save_pandas(df: pd.DataFrame) -> None

Save a pandas DataFrame to this entry's dataset location.

This method builds the concrete dataset via build_dataset and delegates to its save_pandas method using this entry's location. Any dataset-specific save options configured on the entry are respected.

Parameters:

    df (pd.DataFrame): The DataFrame to persist. Required.

Raises:

    PermissionError: If the target cannot be written to.
    ValueError: If the DataFrame is incompatible with the target format/options.
    Exception: Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py
def save_pandas(self, df: "pd.DataFrame") -> None:
    """Save a pandas DataFrame to this entry's dataset location.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``save_pandas`` method using this entry's ``location``.
    Any dataset-specific save options configured on the entry are respected.

    Args:
        df (pd.DataFrame): The DataFrame to persist.

    Raises:
        PermissionError: If the target cannot be written to.
        ValueError: If the DataFrame is incompatible with the target format/options.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    self.build_dataset().save_pandas(self.location, df)