Catalog¶

smallcat.catalog.Catalog ¶

Bases: BaseModel

A collection of named datasets with associated loader configuration.

The catalog maps user-defined keys to concrete dataset entries (e.g., CSV or Excel). It can be constructed from an in-memory dictionary, an Airflow Variable (JSON), or a YAML file.

Attributes:

Name	Type	Description
`entries`	`dict[str, CatalogEntry]`	Mapping of dataset names to their configurations.

Source code in src/smallcat/catalog.py

class Catalog(BaseModel):
    """A collection of named datasets with associated loader configuration.

    The catalog maps user-defined keys to concrete dataset entries (e.g., CSV or
    Excel). It can be constructed from an in-memory dictionary, an Airflow
    Variable (JSON), or a YAML file.

    Attributes:
        entries: Mapping of dataset names to their configurations.
    """

    entries: dict[str, CatalogEntry] = Field(
        ...,
        description="Named data sets",
    )

    @staticmethod
    def from_dict(dictionary: dict) -> "Catalog":
        """Create a catalog from a Python dictionary.

        The dictionary must conform to the `Catalog` schema (i.e., include an
        `entries` key mapping names to valid `CatalogEntry` objects).

        Args:
            dictionary: A dictionary matching the `Catalog` model.

        Returns:
            Catalog: A validated `Catalog` instance.

        Raises:
            pydantic.ValidationError: If the dictionary does not match the schema.
        """
        return Catalog.model_validate(dictionary)

    @staticmethod
    def from_airflow_variable(variable_id: str) -> "Catalog":
        """Create a catalog from an Airflow Variable containing JSON.

        The Airflow Variable should contain a JSON object compatible with the
        `Catalog` schema.

        Args:
            variable_id: The Airflow Variable ID to read (expects JSON).

        Returns:
            Catalog: A `Catalog` instance constructed from the Variable value.

        Raises:
            KeyError: If the Airflow Variable does not exist.
            pydantic.ValidationError: If the JSON payload is invalid for the model.
        """
        try:
            from airflow.sdk import Variable
        except ImportError:
            from airflow.models import Variable  # type: ignore[attr-defined,no-redef] # noqa: I001

        try:
            dictionary_entries = Variable.get(variable_id, deserialize_json=True)
        except TypeError:
            # LocalFilesystemBackend can return an object causing a TypeError
            # In this case we don't need to deserialize into JSON
            #  as it's not a string
            dictionary_entries = Variable.get(variable_id)
        except ImportError as e:
            # Airflow fails with import error if variable is not present and tries
            #  to talk to the Task Supervisor (the runner process) over an internal
            #  comms channel (SUPERVISOR_COMMS) to fetch it.
            msg = f"Variable {variable_id} not found in Airflow"
            raise KeyError(msg) from e
        return Catalog.from_dict(dictionary_entries)

    @staticmethod
    def from_yaml(dictionary_path: str | Path) -> "Catalog":
        """Create a catalog from a YAML file.

        The file is opened in binary mode so that PyYAML performs the encoding
        detection itself (UTF-8 unless a UTF-16 byte order mark is present)
        instead of the platform's locale encoding being applied.

        Args:
            dictionary_path: Path to a YAML file whose content matches the
                `Catalog` schema.

        Returns:
            Catalog: A `Catalog` instance constructed from the YAML content.

        Raises:
            FileNotFoundError: If the YAML file cannot be found.
            yaml.YAMLError: If the file content is not valid YAML.
            pydantic.ValidationError: If the YAML content is invalid for the model.
        """
        # Text mode would decode with the locale's default encoding, which
        # corrupts non-ASCII UTF-8 content on e.g. Windows (cp1252).
        with Path(dictionary_path).open("rb") as f:
            catalog_dict = yaml.safe_load(f)
        return Catalog.from_dict(catalog_dict)

    def _get_entry(self, key: str) -> EntryBase:
        """Return the entry registered under ``key``.

        Args:
            key: The catalog entry name to look up.

        Returns:
            EntryBase: The entry stored in ``self.entries`` for ``key``.

        Raises:
            KeyError: If ``key`` is not present; the original lookup error is
                chained as the cause.
        """
        try:
            return self.entries[key]
        except KeyError as e:
            msg = f"Entry {key} not found in dictionary"
            raise KeyError(msg) from e

    def get_dataset(self, key: str) -> BaseDataset:
        """Instantiate a concrete dataset for a given catalog entry.

        Args:
            key: The name of the catalog entry to resolve.

        Returns:
            BaseDataset: The dataset produced by the entry's ``build_dataset``.

        Raises:
            KeyError: If the key is not present in the catalog.
            Exception: Any error propagated from the entry's ``build_dataset``.
        """
        entry = self._get_entry(key)
        return entry.build_dataset()

    def load_pandas(self, key: str, where: str | None = None) -> "pd.DataFrame":
        """Load a dataset from the catalog into a pandas DataFrame.

        Resolves the catalog entry identified by ``key`` and delegates to the
        entry's ``load_pandas`` method, forwarding ``where``. This is
        equivalent to ``self.entries[key].load_pandas(where=where)`` except
        that a missing key raises a ``KeyError`` with a descriptive message.

        Args:
            key: The catalog entry name to load.
            where: Optional SQL filter predicate forwarded to the dataset.

        Returns:
            pd.DataFrame: The loaded tabular data.

        Raises:
            KeyError: If ``key`` is not present in the catalog.
            Exception: Any error propagated from the underlying dataset's loader.
        """
        entry = self._get_entry(key)
        return entry.load_pandas(where=where)

    def save_pandas(self, key: str, df: "pd.DataFrame") -> None:
        """Save a pandas DataFrame to a dataset in the catalog.

        Resolves the catalog entry identified by ``key`` and delegates to the
        entry's ``save_pandas`` method.

        Args:
            key: The catalog entry name to write to.
            df (pd.DataFrame): The DataFrame to persist.

        Raises:
            KeyError: If ``key`` is not present in the catalog.
            Exception: Any error propagated from the underlying dataset's saver.
        """
        entry = self._get_entry(key)
        entry.save_pandas(df)

    def load_arrow(self, key: str, where: str | None = None) -> "pa.Table":
        """Load a dataset from the catalog into an Apache Arrow Table.

        Resolves the catalog entry identified by `key` and delegates to the
        entry's `load_arrow` method, forwarding `where`. This is equivalent to
        `self.entries[key].load_arrow(where=where)` except that a missing key
        raises a `KeyError` with a descriptive message.

        Args:
            key: The catalog entry name to load.
            where: Optional SQL filter predicate forwarded to the dataset.

        Returns:
            pa.Table: The loaded Arrow table.

        Raises:
            KeyError: If `key` is not present in the catalog.
            Exception: Any error propagated from the underlying dataset's loader.
        """
        entry = self._get_entry(key)
        return entry.load_arrow(where=where)

    def save_arrow(self, key: str, table: "pa.Table") -> None:
        """Save an Apache Arrow Table to a dataset in the catalog.

        Resolves the catalog entry identified by `key` and delegates to the
        entry's `save_arrow` method.

        Args:
            key: The catalog entry name to write to.
            table (pa.Table): The Arrow table to persist.

        Raises:
            KeyError: If `key` is not present in the catalog.
            Exception: Any error propagated from the underlying dataset's saver.
        """
        entry = self._get_entry(key)
        entry.save_arrow(table)

entries `class-attribute` `instance-attribute` ¶

entries: dict[str, CatalogEntry] = Field(..., description='Named data sets')

from_dict `staticmethod` ¶

from_dict(dictionary: dict) -> Catalog

Create a catalog from a Python dictionary.

The dictionary must conform to the Catalog schema (i.e., include an entries key mapping names to valid CatalogEntry objects).

Parameters:

Name	Type	Description	Default
`dictionary`	`dict`	A dictionary matching the `Catalog` model.	required

Returns:

Name	Type	Description
`Catalog`	`Catalog`	A validated `Catalog` instance.

Raises:

Type	Description
`pydantic.ValidationError`	If the dictionary does not match the schema.

Source code in src/smallcat/catalog.py

@staticmethod
def from_dict(dictionary: dict) -> "Catalog":
    """Validate a plain dictionary and return it as a `Catalog`.

    The input has to follow the `Catalog` schema, meaning it carries an
    `entries` key whose values are valid `CatalogEntry` definitions.

    Args:
        dictionary: Raw data shaped like the `Catalog` model.

    Returns:
        Catalog: The instance produced by model validation.

    Raises:
        pydantic.ValidationError: If the data does not satisfy the schema.
    """
    catalog = Catalog.model_validate(dictionary)
    return catalog

from_airflow_variable `staticmethod` ¶

from_airflow_variable(variable_id: str) -> Catalog

Create a catalog from an Airflow Variable containing JSON.

The Airflow Variable should contain a JSON object compatible with the Catalog schema.

Parameters:

Name	Type	Description	Default
`variable_id`	`str`	The Airflow Variable ID to read (expects JSON).	required

Returns:

Name	Type	Description
`Catalog`	`Catalog`	A `Catalog` instance constructed from the Variable value.

Raises:

Type	Description
`KeyError`	If the Airflow Variable does not exist.
`pydantic.ValidationError`	If the JSON payload is invalid for the model.

Source code in src/smallcat/catalog.py

@staticmethod
def from_airflow_variable(variable_id: str) -> "Catalog":
    """Create a catalog from an Airflow Variable containing JSON.

    The Airflow Variable should contain a JSON object compatible with the
    `Catalog` schema. The value is fetched with JSON deserialization first;
    if that raises a `TypeError` the raw value is fetched and used as-is.

    Args:
        variable_id: The Airflow Variable ID to read (expects JSON).

    Returns:
        Catalog: A `Catalog` instance constructed from the Variable value.

    Raises:
        KeyError: If the Airflow Variable does not exist (raised here when the
            lookup surfaces as an `ImportError`).
        pydantic.ValidationError: If the JSON payload is invalid for the model.
    """
    # Prefer the `airflow.sdk` import path and fall back to `airflow.models`
    # when the former is not importable.
    try:
        from airflow.sdk import Variable
    except ImportError:
        from airflow.models import Variable  # type: ignore[attr-defined,no-redef] # noqa: I001

    try:
        dictionary_entries = Variable.get(variable_id, deserialize_json=True)
    except TypeError:
        # LocalFilesystemBackend can return an object causing a TypeError
        # In this case we don't need to deserialize into JSON
        #  as it's not a string
        # NOTE(review): an ImportError raised by this fallback call is not
        #  translated into KeyError, because it happens inside a handler.
        dictionary_entries = Variable.get(variable_id)
    except ImportError as e:
        # Airflow fails with import error if variable is not present and tries
        #  to talk to the Task Supervisor (the runner process) over an internal
        #  comms channel (SUPERVISOR_COMMS) to fetch it.
        msg = f"Variable {variable_id} not found in Airflow"
        raise KeyError(msg) from e
    # Validation of the payload is delegated to `Catalog.from_dict`.
    return Catalog.from_dict(dictionary_entries)

from_yaml `staticmethod` ¶

from_yaml(dictionary_path: str | Path) -> Catalog

Create a catalog from a YAML file.

Parameters:

Name	Type	Description	Default
`dictionary_path`	`str \| Path`	Path to a YAML file whose content matches the `Catalog` schema.	required

Returns:

Name	Type	Description
`Catalog`	`Catalog`	A `Catalog` instance constructed from the YAML content.

Raises:

Type	Description
`FileNotFoundError`	If the YAML file cannot be found.
`pydantic.ValidationError`	If the YAML content is invalid for the model.

Source code in src/smallcat/catalog.py

@staticmethod
def from_yaml(dictionary_path: str | Path) -> "Catalog":
    """Create a catalog from a YAML file.

    The file is opened in binary mode so that PyYAML performs the encoding
    detection itself (UTF-8 unless a UTF-16 byte order mark is present)
    instead of the platform's locale encoding being applied.

    Args:
        dictionary_path: Path to a YAML file whose content matches the
            `Catalog` schema.

    Returns:
        Catalog: A `Catalog` instance constructed from the YAML content.

    Raises:
        FileNotFoundError: If the YAML file cannot be found.
        yaml.YAMLError: If the file content is not valid YAML.
        pydantic.ValidationError: If the YAML content is invalid for the model.
    """
    # Text mode would decode with the locale's default encoding, which
    # corrupts non-ASCII UTF-8 content on e.g. Windows (cp1252).
    with Path(dictionary_path).open("rb") as f:
        catalog_dict = yaml.safe_load(f)
    return Catalog.from_dict(catalog_dict)

_get_entry ¶

_get_entry(key: str) -> EntryBase

Source code in src/smallcat/catalog.py

def _get_entry(self, key: str) -> EntryBase:
    try:
        return self.entries[key]
    except KeyError as e:
        msg = f"Entry {key} not found in dictionary"
        raise KeyError(msg) from e

get_dataset ¶

get_dataset(key: str) -> BaseDataset

Instantiate a concrete dataset for a given catalog entry.

Parameters:

Name	Type	Description	Default
`key`	`str`	The name of the catalog entry to resolve.	required

Returns:

Name	Type	Description
`BaseDataset`	`BaseDataset`	A dataset instance ready to load/save the data.

Raises:

Type	Description
`KeyError`	If the key is not present in the catalog.
`ValueError`	If the entry's `file_format` is not supported.

Source code in src/smallcat/catalog.py

def get_dataset(self, key: str) -> BaseDataset:
    """Build the dataset object that belongs to a catalog entry.

    Args:
        key: Name of the catalog entry to resolve.

    Returns:
        BaseDataset: Whatever the entry's `build_dataset` returns.

    Raises:
        KeyError: If the key is not present in the catalog.
        Exception: Any error propagated from the entry's `build_dataset`.
    """
    return self._get_entry(key).build_dataset()

load_pandas ¶

load_pandas(key: str, where: str | None = None) -> pd.DataFrame

Load a dataset from the catalog into a pandas DataFrame.

Resolves the catalog entry identified by `key` and delegates to `EntryBase.load_pandas`. This is equivalent to:

`self.entries[key].build_dataset().load_pandas(self.entries[key].location, where=where)`

Parameters:

Name	Type	Description	Default
`key`	`str`	The catalog entry name to load.	required
`where`	`str \| None`	Optional SQL filter predicate forwarded to the dataset.	`None`

Returns:

Type	Description
`pd.DataFrame`	The loaded tabular data.

Raises:

Type	Description
`KeyError`	If `key` is not present in the catalog.
`Exception`	Any error propagated from the underlying dataset's loader.

Source code in src/smallcat/catalog.py

def load_pandas(self, key: str, where: str | None = None) -> "pd.DataFrame":
    """Load a dataset from the catalog into a pandas DataFrame.

    Resolves the catalog entry identified by ``key`` and delegates to
    :meth:`EntryBase.load_pandas`. This is equivalent to:

        ``self.entries[key].build_dataset().load_pandas(entry.location)``

    Args:
        key: The catalog entry name to load.
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        KeyError: If ``key`` is not present in the catalog.
        Exception: Any error propagated from the underlying dataset's loader.
    """
    entry = self._get_entry(key)
    return entry.load_pandas(where=where)

save_pandas ¶

save_pandas(key: str, df: pd.DataFrame) -> None

Save a pandas DataFrame to a dataset in the catalog.

Resolves the catalog entry identified by `key` and delegates to `EntryBase.save_pandas`. This writes to the entry's configured `location` with any format-specific save options applied.

Parameters:

Name	Type	Description	Default
`key`	`str`	The catalog entry name to write to.	required
`df`	`pd.DataFrame`	The DataFrame to persist.	required

Raises:

Type	Description
`KeyError`	If `key` is not present in the catalog.
`Exception`	Any error propagated from the underlying dataset's saver.

Source code in src/smallcat/catalog.py

def save_pandas(self, key: str, df: "pd.DataFrame") -> None:
    """Write a pandas DataFrame to the dataset registered under ``key``.

    The entry is resolved through ``_get_entry`` and the DataFrame is handed
    to that entry's ``save_pandas`` method.

    Args:
        key: The catalog entry name to write to.
        df (pd.DataFrame): The DataFrame to persist.

    Raises:
        KeyError: If ``key`` is not present in the catalog.
        Exception: Any error propagated from the underlying dataset's saver.
    """
    self._get_entry(key).save_pandas(df)

load_arrow ¶

load_arrow(key: str, where: str | None = None) -> pa.Table

Load a dataset from the catalog into an Apache Arrow Table.

Resolves the catalog entry identified by `key` and delegates to `EntryBase.load_arrow`. This is equivalent to:

`self.entries[key].build_dataset().load_arrow_table(entry.location)`

Parameters:

Name	Type	Description	Default
`key`	`str`	The catalog entry name to load.	required
`where`	`str \| None`	Optional SQL filter predicate forwarded to the dataset.	`None`

Returns:

Type	Description
`pa.Table`	The loaded Arrow table.

Raises:

Type	Description
`KeyError`	If `key` is not present in the catalog.
`Exception`	Any error propagated from the underlying dataset's loader.

Source code in src/smallcat/catalog.py

def load_arrow(self, key: str, where: str | None = None) -> "pa.Table":
    """Load a dataset from the catalog into an Apache Arrow Table.

    Resolves the catalog entry identified by `key` and delegates to
    :meth:`EntryBase.load_arrow`. This is equivalent to:

        `self.entries[key].build_dataset().load_arrow_table(entry.location)`

    Args:
        key: The catalog entry name to load.
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pa.Table: The loaded Arrow table.

    Raises:
        KeyError: If `key` is not present in the catalog.
        Exception: Any error propagated from the underlying dataset's loader.
    """
    entry = self._get_entry(key)
    return entry.load_arrow(where=where)

save_arrow ¶

save_arrow(key: str, table: pa.Table) -> None

Save an Apache Arrow Table to a dataset in the catalog.

Resolves the catalog entry identified by `key` and delegates to `EntryBase.save_arrow`. This writes to the entry's configured `location` with any format-specific save options applied.

Parameters:

Name	Type	Description	Default
`key`	`str`	The catalog entry name to write to.	required
`table`	`pa.Table`	The Arrow table to persist.	required

Raises:

Type	Description
`KeyError`	If `key` is not present in the catalog.
`Exception`	Any error propagated from the underlying dataset's saver.

Source code in src/smallcat/catalog.py

def save_arrow(self, key: str, table: "pa.Table") -> None:
    """Write an Apache Arrow Table to the dataset registered under `key`.

    The entry is resolved through `_get_entry` and the table is handed to
    that entry's `save_arrow` method.

    Args:
        key: The catalog entry name to write to.
        table (pa.Table): The Arrow table to persist.

    Raises:
        KeyError: If `key` is not present in the catalog.
        Exception: Any error propagated from the underlying dataset's saver.
    """
    self._get_entry(key).save_arrow(table)

Entries¶

smallcat.catalog.CSVEntry ¶

Bases: EntryBase

Catalog entry describing a CSV dataset.

Attributes:

Name	Type	Description
`file_format`	`Literal['csv']`	Literal string identifying the file format: `'csv'`.
`load_options`	`CSVLoadOptions \| None`	Options controlling CSV reading (see `CSVLoadOptions`).
`save_options`	`CSVSaveOptions \| None`	Options controlling CSV writing (see `CSVSaveOptions`).

Source code in src/smallcat/catalog.py

class CSVEntry(EntryBase):
    """Catalog entry for a dataset stored as CSV.

    Attributes:
        file_format: Fixed literal `'csv'` identifying the file format.
        load_options: Settings applied when *reading* CSV (see `CSVLoadOptions`).
        save_options: Settings applied when *writing* CSV (see `CSVSaveOptions`).
    """

    file_format: Literal["csv"] = "csv"
    # NOTE(review): no default is declared for the two option fields; if
    # EntryBase is a pydantic v2 model they are required keys even though
    # None is accepted - confirm that this is intended.
    load_options: CSVLoadOptions | None
    save_options: CSVSaveOptions | None

    def build_dataset(self) -> CSVDataset:
        """Create the :class:`CSVDataset` described by this entry.

        Returns:
            CSVDataset: A dataset built from `get_connection()` and this
                entry's load/save options.
        """
        connection = self.get_connection()
        return CSVDataset(
            conn=connection,
            load_options=self.load_options,
            save_options=self.save_options,
        )

file_format `class-attribute` `instance-attribute` ¶

file_format: Literal['csv'] = 'csv'

load_options `instance-attribute` ¶

load_options: CSVLoadOptions | None

save_options `instance-attribute` ¶

save_options: CSVSaveOptions | None

build_dataset ¶

build_dataset() -> CSVDataset

Build a `CSVDataset` using this entry's configuration.

Returns:

Name	Type	Description
`CSVDataset`	`CSVDataset`	A dataset configured with the resolved connection and options.

Source code in src/smallcat/catalog.py

def build_dataset(self) -> CSVDataset:
    """Create the :class:`CSVDataset` described by this entry.

    Returns:
        CSVDataset: A dataset built from `get_connection()` and this
            entry's load/save options.
    """
    connection = self.get_connection()
    return CSVDataset(
        conn=connection,
        load_options=self.load_options,
        save_options=self.save_options,
    )

load_pandas ¶

load_pandas(where: str | None = None) -> pd.DataFrame

Load this entry's dataset into a pandas DataFrame.

This method builds the concrete dataset via `build_dataset` and delegates to its `load_pandas` method using this entry's `location`. Any dataset-specific load options configured on the entry are respected.

Parameters:

Name	Type	Description	Default
`where`	`str \| None`	Optional SQL filter predicate forwarded to the dataset.	`None`

Returns:

Type	Description
`pd.DataFrame`	The loaded tabular data.

Raises:

Type	Description
`FileNotFoundError`	If the target path/table at `location` does not exist.
`ValueError`	If the data cannot be parsed as tabular data.
`Exception`	Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py

def load_pandas(self, where: str | None = None) -> "pd.DataFrame":
    """Load this entry's dataset into a pandas DataFrame.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``load_pandas`` method using this entry's ``location``.
    Any dataset-specific load options configured on the entry are respected.

    Args:
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        FileNotFoundError: If the target path/table at ``location`` does not exist.
        ValueError: If the data cannot be parsed as tabular data.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    return self.build_dataset().load_pandas(self.location, where=where)

save_pandas ¶

save_pandas(df: pd.DataFrame) -> None

Save a pandas DataFrame to this entry's dataset location.

This method builds the concrete dataset via `build_dataset` and delegates to its `save_pandas` method using this entry's `location`. Any dataset-specific save options configured on the entry are respected.

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	The DataFrame to persist.	required

Raises:

Type	Description
`PermissionError`	If the target cannot be written to.
`ValueError`	If the DataFrame is incompatible with the target format/options.
`Exception`	Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py

def save_pandas(self, df: "pd.DataFrame") -> None:
    """Write a pandas DataFrame to this entry's location.

    A concrete dataset is created with ``build_dataset`` and its
    ``save_pandas`` method is called with this entry's ``location`` and the
    DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to persist.

    Raises:
        Exception: Any error raised by ``build_dataset`` or by the underlying
            dataset implementation while saving.
    """
    dataset = self.build_dataset()
    dataset.save_pandas(self.location, df)

smallcat.catalog.ExcelEntry ¶

Bases: EntryBase

Catalog entry describing an Excel dataset.

Attributes:

Name	Type	Description
`file_format`	`Literal['excel']`	Literal string identifying the file format: `'excel'`.
`load_options`	`ExcelLoadOptions \| None`	Options controlling Excel reading (see `ExcelLoadOptions`).
`save_options`	`ExcelSaveOptions \| None`	Options controlling Excel writing (see `ExcelSaveOptions`).

Source code in src/smallcat/catalog.py

class ExcelEntry(EntryBase):
    """Catalog entry for a dataset stored as an Excel workbook.

    Attributes:
        file_format: Fixed literal `'excel'` identifying the file format.
        load_options: Settings applied when *reading* Excel (see `ExcelLoadOptions`).
        save_options: Settings applied when *writing* Excel (see `ExcelSaveOptions`).
    """

    file_format: Literal["excel"] = "excel"
    # NOTE(review): no default is declared for the two option fields; if
    # EntryBase is a pydantic v2 model they are required keys even though
    # None is accepted - confirm that this is intended.
    load_options: ExcelLoadOptions | None
    save_options: ExcelSaveOptions | None

    def build_dataset(self) -> ExcelDataset:
        """Create the :class:`ExcelDataset` described by this entry.

        Returns:
            ExcelDataset: A dataset built from `get_connection()` and this
                entry's load/save options.
        """
        connection = self.get_connection()
        return ExcelDataset(
            conn=connection,
            load_options=self.load_options,
            save_options=self.save_options,
        )

file_format `class-attribute` `instance-attribute` ¶

file_format: Literal['excel'] = 'excel'

load_options `instance-attribute` ¶

load_options: ExcelLoadOptions | None

save_options `instance-attribute` ¶

save_options: ExcelSaveOptions | None

build_dataset ¶

build_dataset() -> ExcelDataset

Build an `ExcelDataset` using this entry's configuration.

Returns:

Name	Type	Description
`ExcelDataset`	`ExcelDataset`	A dataset configured with the resolved connection and options.

Source code in src/smallcat/catalog.py

def build_dataset(self) -> ExcelDataset:
    """Create the :class:`ExcelDataset` described by this entry.

    Returns:
        ExcelDataset: A dataset built from `get_connection()` and this
            entry's load/save options.
    """
    connection = self.get_connection()
    return ExcelDataset(
        conn=connection,
        load_options=self.load_options,
        save_options=self.save_options,
    )

load_pandas ¶

load_pandas(where: str | None = None) -> pd.DataFrame

Load this entry's dataset into a pandas DataFrame.

This method builds the concrete dataset via `build_dataset` and delegates to its `load_pandas` method using this entry's `location`. Any dataset-specific load options configured on the entry are respected.

Parameters:

Name	Type	Description	Default
`where`	`str \| None`	Optional SQL filter predicate forwarded to the dataset.	`None`

Returns:

Type	Description
`pd.DataFrame`	The loaded tabular data.

Raises:

Type	Description
`FileNotFoundError`	If the target path/table at `location` does not exist.
`ValueError`	If the data cannot be parsed as tabular data.
`Exception`	Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py

def load_pandas(self, where: str | None = None) -> "pd.DataFrame":
    """Load this entry's dataset into a pandas DataFrame.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``load_pandas`` method using this entry's ``location``.
    Any dataset-specific load options configured on the entry are respected.

    Args:
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        FileNotFoundError: If the target path/table at ``location`` does not exist.
        ValueError: If the data cannot be parsed as tabular data.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    return self.build_dataset().load_pandas(self.location, where=where)

save_pandas ¶

save_pandas(df: pd.DataFrame) -> None

Save a pandas DataFrame to this entry's dataset location.

This method builds the concrete dataset via `build_dataset` and delegates to its `save_pandas` method using this entry's `location`. Any dataset-specific save options configured on the entry are respected.

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	The DataFrame to persist.	required

Raises:

Type	Description
`PermissionError`	If the target cannot be written to.
`ValueError`	If the DataFrame is incompatible with the target format/options.
`Exception`	Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py

def save_pandas(self, df: "pd.DataFrame") -> None:
    """Write a pandas DataFrame to this entry's location.

    A concrete dataset is created with ``build_dataset`` and its
    ``save_pandas`` method is called with this entry's ``location`` and the
    DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to persist.

    Raises:
        Exception: Any error raised by ``build_dataset`` or by the underlying
            dataset implementation while saving.
    """
    dataset = self.build_dataset()
    dataset.save_pandas(self.location, df)

smallcat.catalog.ParquetEntry ¶

Bases: EntryBase

Catalog entry describing a Parquet dataset.

Attributes:

Name	Type	Description
`file_format`	`Literal['parquet']`	Literal string identifying the file format: `'parquet'`.
`load_options`	`ParquetLoadOptions \| None`	Optional configuration controlling Parquet reading behavior (see `ParquetLoadOptions`).
`save_options`	`ParquetSaveOptions \| None`	Optional configuration controlling Parquet writing behavior (see `ParquetSaveOptions`).

Source code in src/smallcat/catalog.py

class ParquetEntry(EntryBase):
    """Catalog entry for a dataset stored as Parquet.

    Attributes:
        file_format: Fixed literal `'parquet'` identifying the file format.
        load_options: Settings applied when *reading* Parquet
            (see `ParquetLoadOptions`).
        save_options: Settings applied when *writing* Parquet
            (see `ParquetSaveOptions`).
    """

    file_format: Literal["parquet"] = "parquet"
    # NOTE(review): no default is declared for the two option fields; if
    # EntryBase is a pydantic v2 model they are required keys even though
    # None is accepted - confirm that this is intended.
    load_options: ParquetLoadOptions | None
    save_options: ParquetSaveOptions | None

    def build_dataset(self) -> ParquetDataset:
        """Create the :class:`ParquetDataset` described by this entry.

        Returns:
            ParquetDataset: A dataset built from `get_connection()` and this
                entry's Parquet load/save options.
        """
        connection = self.get_connection()
        return ParquetDataset(
            conn=connection,
            load_options=self.load_options,
            save_options=self.save_options,
        )

file_format `class-attribute` `instance-attribute` ¶

file_format: Literal['parquet'] = 'parquet'

load_options `instance-attribute` ¶

load_options: ParquetLoadOptions | None

save_options `instance-attribute` ¶

save_options: ParquetSaveOptions | None

build_dataset ¶

build_dataset() -> ParquetDataset

Build a `ParquetDataset` using this entry's configuration.

Returns:

Name	Type	Description
`ParquetDataset`	`ParquetDataset`	A dataset configured with the resolved connection and Parquet-specific options.

Source code in src/smallcat/catalog.py

def build_dataset(self) -> ParquetDataset:
    """Create the :class:`ParquetDataset` backing this entry.

    Returns:
        ParquetDataset: A dataset wired to the connection returned by
            ``get_connection()`` and to this entry's Parquet load/save
            options.
    """
    # Resolve the connection first, then hand it over together with the
    # Parquet-specific options stored on this entry.
    connection = self.get_connection()
    return ParquetDataset(
        conn=connection,
        load_options=self.load_options,
        save_options=self.save_options,
    )

load_pandas ¶

load_pandas(where: str | None = None) -> pd.DataFrame

Load this entry's dataset into a pandas DataFrame.

This method builds the concrete dataset via `build_dataset` and delegates to its `load_pandas` method using this entry's `location`. Any dataset-specific load options configured on the entry are respected.

Parameters:

Name	Type	Description	Default
`where`	`str \| None`	Optional SQL filter predicate forwarded to the dataset.	`None`

Returns:

Type	Description
`pd.DataFrame`	The loaded tabular data.

Raises:

Type	Description
`FileNotFoundError`	If the target path/table at `location` does not exist.
`ValueError`	If the data cannot be parsed as tabular data.
`Exception`	Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py

def load_pandas(self, where: str | None = None) -> "pd.DataFrame":
    """Load this entry's dataset into a pandas DataFrame.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``load_pandas`` method using this entry's ``location``.
    Any dataset-specific load options configured on the entry are respected.

    Args:
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        FileNotFoundError: If the target path/table at ``location`` does not exist.
        ValueError: If the data cannot be parsed as tabular data.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    return self.build_dataset().load_pandas(self.location, where=where)

save_pandas ¶

save_pandas(df: pd.DataFrame) -> None

Save a pandas DataFrame to this entry's dataset location.

This method builds the concrete dataset via `build_dataset` and delegates to its `save_pandas` method using this entry's `location`. Any dataset-specific save options configured on the entry are respected.

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	The DataFrame to persist.	required

Raises:

Type	Description
`PermissionError`	If the target cannot be written to.
`ValueError`	If the DataFrame is incompatible with the target format/options.
`Exception`	Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py

def save_pandas(self, df: "pd.DataFrame") -> None:
    """Write a pandas DataFrame to the location described by this entry.

    The concrete dataset is obtained from :meth:`build_dataset`; its
    ``save_pandas`` method is then called with this entry's ``location`` and
    the given frame, so the save options the dataset was built with are
    honoured.

    Args:
        df: The DataFrame to persist.

    Raises:
        PermissionError: If the target cannot be written to.
        ValueError: If the DataFrame does not fit the target format/options.
        Exception: Any other error surfaced by the underlying dataset.
    """
    dataset = self.build_dataset()
    dataset.save_pandas(self.location, df)

smallcat.catalog.DeltaTableEntry ¶

Bases: EntryBase

Catalog entry describing a Delta Lake table dataset.

This entry specifies configuration for reading from or writing to Delta Lake tables, typically stored on local or cloud-backed storage. It includes both connection details and Delta-specific load/save options.

Attributes:

Name	Type	Description
`file_format`	`Literal['delta_table']`	Literal string identifying the file format: `'delta_table'`.
`load_options`	`DeltaTableLoadOptions \| None`	Optional configuration controlling Delta table reading behavior (see `DeltaTableLoadOptions`).
`save_options`	`DeltaTableSaveOptions \| None`	Optional configuration controlling Delta table writing behavior (see `DeltaTableSaveOptions`).

Source code in src/smallcat/catalog.py

class DeltaTableEntry(EntryBase):
    """Describe a Delta Lake table registered in the catalog.

    The entry carries the configuration needed to read from or write to a
    Delta Lake table, typically stored on local or cloud-backed storage:
    connection details plus Delta-specific load/save options.

    Attributes:
        file_format: Literal string identifying the file format; fixed to
            `'delta_table'`.
        load_options: Settings controlling how the Delta table is *read*, or
            `None` (see :class:`DeltaTableLoadOptions`).
        save_options: Settings controlling how the Delta table is *written*,
            or `None` (see :class:`DeltaTableSaveOptions`).
    """

    file_format: Literal["delta_table"] = "delta_table"
    load_options: DeltaTableLoadOptions | None
    save_options: DeltaTableSaveOptions | None

    def build_dataset(self) -> DeltaTableDataset:
        """Create the :class:`DeltaTableDataset` backing this entry.

        Returns:
            DeltaTableDataset: A dataset wired to the connection returned by
                ``get_connection()`` and to this entry's Delta Lake
                load/save options.
        """
        # Resolve the connection first, then hand it over together with the
        # Delta-specific options stored on this entry.
        connection = self.get_connection()
        return DeltaTableDataset(
            conn=connection,
            load_options=self.load_options,
            save_options=self.save_options,
        )

file_format `class-attribute` `instance-attribute` ¶

file_format: Literal['delta_table'] = 'delta_table'

load_options `instance-attribute` ¶

load_options: DeltaTableLoadOptions | None

save_options `instance-attribute` ¶

save_options: DeltaTableSaveOptions | None

build_dataset ¶

build_dataset() -> DeltaTableDataset

Build a `DeltaTableDataset` using this entry's configuration.

Returns:

Name	Type	Description
`DeltaTableDataset`	`DeltaTableDataset`	A dataset configured with the resolved connection and Delta Lake options.

Source code in src/smallcat/catalog.py

def build_dataset(self) -> DeltaTableDataset:
    """Create the :class:`DeltaTableDataset` backing this entry.

    Returns:
        DeltaTableDataset: A dataset wired to the connection returned by
            ``get_connection()`` and to this entry's Delta Lake load/save
            options.
    """
    # Resolve the connection first, then hand it over together with the
    # Delta-specific options stored on this entry.
    connection = self.get_connection()
    return DeltaTableDataset(
        conn=connection,
        load_options=self.load_options,
        save_options=self.save_options,
    )

load_pandas ¶

load_pandas(where: str | None = None) -> pd.DataFrame

Load this entry's dataset into a pandas DataFrame.

This method builds the concrete dataset via `build_dataset` and delegates to its `load_pandas` method using this entry's `location`. Any dataset-specific load options configured on the entry are respected.

Parameters:

Name	Type	Description	Default
`where`	`str \| None`	Optional SQL filter predicate forwarded to the dataset.	`None`

Returns:

Type	Description
`pd.DataFrame`	The loaded tabular data.

Raises:

Type	Description
`FileNotFoundError`	If the target path/table at `location` does not exist.
`ValueError`	If the data cannot be parsed as tabular data.
`Exception`	Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py

def load_pandas(self, where: str | None = None) -> "pd.DataFrame":
    """Load this entry's dataset into a pandas DataFrame.

    This method builds the concrete dataset via :meth:`build_dataset` and
    delegates to its ``load_pandas`` method using this entry's ``location``.
    Any dataset-specific load options configured on the entry are respected.

    Args:
        where: Optional SQL filter predicate forwarded to the dataset.

    Returns:
        pd.DataFrame: The loaded tabular data.

    Raises:
        FileNotFoundError: If the target path/table at ``location`` does not exist.
        ValueError: If the data cannot be parsed as tabular data.
        Exception: Any other error raised by the underlying dataset implementation.
    """
    return self.build_dataset().load_pandas(self.location, where=where)

save_pandas ¶

save_pandas(df: pd.DataFrame) -> None

Save a pandas DataFrame to this entry's dataset location.

This method builds the concrete dataset via `build_dataset` and delegates to its `save_pandas` method using this entry's `location`. Any dataset-specific save options configured on the entry are respected.

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	The DataFrame to persist.	required

Raises:

Type	Description
`PermissionError`	If the target cannot be written to.
`ValueError`	If the DataFrame is incompatible with the target format/options.
`Exception`	Any other error raised by the underlying dataset implementation.

Source code in src/smallcat/catalog.py

def save_pandas(self, df: "pd.DataFrame") -> None:
    """Write a pandas DataFrame to the location described by this entry.

    The concrete dataset is obtained from :meth:`build_dataset`; its
    ``save_pandas`` method is then called with this entry's ``location`` and
    the given frame, so the save options the dataset was built with are
    honoured.

    Args:
        df: The DataFrame to persist.

    Raises:
        PermissionError: If the target cannot be written to.
        ValueError: If the DataFrame does not fit the target format/options.
        Exception: Any other error surfaced by the underlying dataset.
    """
    dataset = self.build_dataset()
    dataset.save_pandas(self.location, df)

Catalog¶

smallcat.catalog.Catalog ¶

entries class-attribute instance-attribute ¶

from_dict staticmethod ¶

from_airflow_variable staticmethod ¶

from_yaml staticmethod ¶

_get_entry ¶

get_dataset ¶

load_pandas ¶

save_pandas ¶

load_arrow ¶

save_arrow ¶

Entries¶

smallcat.catalog.CSVEntry ¶

file_format class-attribute instance-attribute ¶

load_options instance-attribute ¶

save_options instance-attribute ¶

build_dataset ¶

load_pandas ¶

save_pandas ¶

smallcat.catalog.ExcelEntry ¶

file_format class-attribute instance-attribute ¶

load_options instance-attribute ¶

save_options instance-attribute ¶

build_dataset ¶

load_pandas ¶

save_pandas ¶

smallcat.catalog.ParquetEntry ¶

file_format class-attribute instance-attribute ¶

load_options instance-attribute ¶

save_options instance-attribute ¶

build_dataset ¶

load_pandas ¶

save_pandas ¶

smallcat.catalog.DeltaTableEntry ¶

file_format class-attribute instance-attribute ¶

load_options instance-attribute ¶

save_options instance-attribute ¶

build_dataset ¶

load_pandas ¶

save_pandas ¶

entries `class-attribute` `instance-attribute` ¶

from_dict `staticmethod` ¶

from_airflow_variable `staticmethod` ¶

from_yaml `staticmethod` ¶

file_format `class-attribute` `instance-attribute` ¶

load_options `instance-attribute` ¶

save_options `instance-attribute` ¶

file_format `class-attribute` `instance-attribute` ¶

load_options `instance-attribute` ¶

save_options `instance-attribute` ¶

file_format `class-attribute` `instance-attribute` ¶

load_options `instance-attribute` ¶

save_options `instance-attribute` ¶

file_format `class-attribute` `instance-attribute` ¶

load_options `instance-attribute` ¶

save_options `instance-attribute` ¶