Module datatap.api.entities.dataset
Expand source code
from __future__ import annotations
from datatap.api.types.dataset import JsonDatasetRepository
from typing import Generator, Generic, List, TypeVar, Union, overload
from datatap.droplet import ImageAnnotation, VideoAnnotation
from datatap.template import ImageAnnotationTemplate, VideoAnnotationTemplate
from datatap.utils import basic_repr
from ..endpoints import ApiEndpoints
from ..types import JsonDataset
# Type parameter of `Dataset`, constrained to the two annotation template classes imported above.
T = TypeVar("T", ImageAnnotationTemplate, VideoAnnotationTemplate)
class DatasetRepository:
    """
    Describes the repository a dataset came from, as a namespace/name pair.
    """

    name: str
    """
    The repository's name.
    """

    namespace: str
    """
    The namespace the repository lives in.
    """

    def __init__(self, *, name: str, namespace: str):
        # Both values are keyword-only and stored verbatim.
        self.namespace = namespace
        self.name = name

    @staticmethod
    def from_json(json: JsonDatasetRepository) -> DatasetRepository:
        """
        Builds a `DatasetRepository` from its JSON form by reading the
        `"name"` and `"namespace"` keys of `json`.
        """
        repo_name = json["name"]
        repo_namespace = json["namespace"]
        return DatasetRepository(namespace = repo_namespace, name = repo_name)
class Dataset(Generic[T]):
    """
    Represents a concrete version of a dataset. Critically, `Dataset`s cannot be changed
    once they're created.

    For reproducible training, ensure that you store the specific `Dataset` used
    during training.
    """

    _endpoints: ApiEndpoints

    uid: str
    """
    The UID of this `Dataset`.
    """

    database: str
    """
    The UID of the database in which this dataset lives.
    """

    repository: DatasetRepository
    """
    The repository this dataset belongs to.
    """

    splits: List[str]
    """
    A list of all the splits that this dataset has. By default, this will be
    `["training", "validation"]`.
    """

    template: T
    """
    The template that all annotations in this dataset version adhere to.
    """

    @staticmethod
    def from_json(endpoints: ApiEndpoints, json: JsonDataset) -> AnyDataset:
        """
        Creates a new `Dataset` from a `JsonDataset`.

        The template class is selected by `json["template"]["kind"]`. A `ValueError`
        is raised for any kind other than `"ImageAnnotationTemplate"` or
        `"VideoAnnotationTemplate"`.
        """
        template_json = json["template"]
        kind = template_json["kind"]

        template: Union[ImageAnnotationTemplate, VideoAnnotationTemplate]
        if kind == "ImageAnnotationTemplate":
            template = ImageAnnotationTemplate.from_json(template_json)
        elif kind == "VideoAnnotationTemplate":
            template = VideoAnnotationTemplate.from_json(template_json)
        else:
            raise ValueError(f"Unknown template kind: {template_json['kind']}")

        return Dataset(
            endpoints,
            uid = json["uid"],
            database = json["database"],
            repository = DatasetRepository.from_json(json["repository"]),
            splits = json["splits"],
            template = template
        )

    def __init__(
        self,
        endpoints: ApiEndpoints,
        uid: str,
        *,
        database: str,
        repository: DatasetRepository,
        splits: List[str],
        template: Union[ImageAnnotationTemplate, VideoAnnotationTemplate]
    ):
        """
        Stores the endpoints used for streaming together with the dataset's
        metadata. Everything after `uid` must be passed by keyword.
        """
        self._endpoints = endpoints
        self.uid = uid
        self.database = database
        self.repository = repository
        self.splits = splits
        self.template = template

    @overload
    def stream_split(
        self: Dataset[ImageAnnotationTemplate],
        split: str
    ) -> Generator[ImageAnnotation, None, None]: ...
    @overload
    def stream_split(
        self: Dataset[ImageAnnotationTemplate],
        split: str,
        chunk: int,
        nchunks: int
    ) -> Generator[ImageAnnotation, None, None]: ...
    @overload
    def stream_split(
        self: Dataset[VideoAnnotationTemplate],
        split: str
    ) -> Generator[VideoAnnotation, None, None]: ...
    @overload
    def stream_split(
        self: Dataset[VideoAnnotationTemplate],
        split: str,
        chunk: int,
        nchunks: int
    ) -> Generator[VideoAnnotation, None, None]: ...
    def stream_split(
        self,
        split: str,
        chunk: int = 0,
        nchunks: int = 1
    ) -> Generator[Union[ImageAnnotation, VideoAnnotation], None, None]:
        """
        Streams a specific split of this dataset from the database. All yielded annotations will adhere to this
        dataset's annotation template.

        If `chunk` and `nchunks` are omitted, then the full split will be streamed. Otherwise, the split will be
        broken into `nchunks` pieces, and only the chunk identified by `chunk` will be streamed.

        Raises `ValueError` (when iteration starts) if this dataset's template is neither an
        `ImageAnnotationTemplate` nor a `VideoAnnotationTemplate`.
        """
        # The template does not change while streaming, so choose the droplet parser once
        # instead of re-checking the template type for every droplet. This also rejects an
        # unsupported template before the endpoint is called.
        if isinstance(self.template, ImageAnnotationTemplate):
            parse_droplet = ImageAnnotation.from_json
        elif isinstance(self.template, VideoAnnotationTemplate): # type: ignore - isinstance is excessive
            parse_droplet = VideoAnnotation.from_json
        else:
            raise ValueError(f"Unknown template kind: {type(self.template)}")

        for droplet in self._endpoints.dataset.stream_split(
            database_uid = self.database,
            namespace = self.repository.namespace,
            name = self.repository.name,
            uid = self.uid,
            split = split,
            chunk = chunk,
            nchunks = nchunks,
        ):
            yield parse_droplet(droplet)

    def get_stable_identifier(self) -> str:
        """
        Returns an identifier for this dataset version, formatted as
        `<namespace>/<name>:<uid>`.
        """
        repo = self.repository
        return f"{repo.namespace}/{repo.name}:{self.uid}"

    def __repr__(self) -> str:
        """
        Renders this dataset through `basic_repr`, passing its stable identifier,
        database and splits.
        """
        return basic_repr(
            "Dataset",
            self.get_stable_identifier(),
            database = self.database,
            splits = self.splits
        )
# Union of the two concrete `Dataset` parameterizations; used as the return type of `Dataset.from_json`.
AnyDataset = Union[Dataset[ImageAnnotationTemplate], Dataset[VideoAnnotationTemplate]]
Classes
class Dataset (endpoints: ApiEndpoints, uid: str, *, database: str, repository: DatasetRepository, splits: List[str], template: Union[ImageAnnotationTemplate, VideoAnnotationTemplate])
-
Represents a concrete version of a dataset. Critically, `Dataset`s cannot be changed once they're created.
For reproducible training, ensure that you store the specific `Dataset` used during training.
Expand source code
class Dataset(Generic[T]):
    """
    Represents a concrete version of a dataset. Critically, `Dataset`s cannot be changed
    once they're created.

    For reproducible training, ensure that you store the specific `Dataset` used
    during training.
    """

    _endpoints: ApiEndpoints

    uid: str
    """
    The UID of this `Dataset`.
    """

    database: str
    """
    The UID of the database in which this dataset lives.
    """

    repository: DatasetRepository
    """
    The repository this dataset belongs to.
    """

    splits: List[str]
    """
    A list of all the splits that this dataset has. By default, this will be
    `["training", "validation"]`.
    """

    template: T
    """
    The template that all annotations in this dataset version adhere to.
    """

    @staticmethod
    def from_json(endpoints: ApiEndpoints, json: JsonDataset) -> AnyDataset:
        """
        Creates a new `Dataset` from a `JsonDataset`.

        The template class is selected by `json["template"]["kind"]`. A `ValueError`
        is raised for any kind other than `"ImageAnnotationTemplate"` or
        `"VideoAnnotationTemplate"`.
        """
        template_json = json["template"]
        kind = template_json["kind"]

        template: Union[ImageAnnotationTemplate, VideoAnnotationTemplate]
        if kind == "ImageAnnotationTemplate":
            template = ImageAnnotationTemplate.from_json(template_json)
        elif kind == "VideoAnnotationTemplate":
            template = VideoAnnotationTemplate.from_json(template_json)
        else:
            raise ValueError(f"Unknown template kind: {template_json['kind']}")

        return Dataset(
            endpoints,
            uid = json["uid"],
            database = json["database"],
            repository = DatasetRepository.from_json(json["repository"]),
            splits = json["splits"],
            template = template
        )

    def __init__(
        self,
        endpoints: ApiEndpoints,
        uid: str,
        *,
        database: str,
        repository: DatasetRepository,
        splits: List[str],
        template: Union[ImageAnnotationTemplate, VideoAnnotationTemplate]
    ):
        """
        Stores the endpoints used for streaming together with the dataset's
        metadata. Everything after `uid` must be passed by keyword.
        """
        self._endpoints = endpoints
        self.uid = uid
        self.database = database
        self.repository = repository
        self.splits = splits
        self.template = template

    @overload
    def stream_split(
        self: Dataset[ImageAnnotationTemplate],
        split: str
    ) -> Generator[ImageAnnotation, None, None]: ...
    @overload
    def stream_split(
        self: Dataset[ImageAnnotationTemplate],
        split: str,
        chunk: int,
        nchunks: int
    ) -> Generator[ImageAnnotation, None, None]: ...
    @overload
    def stream_split(
        self: Dataset[VideoAnnotationTemplate],
        split: str
    ) -> Generator[VideoAnnotation, None, None]: ...
    @overload
    def stream_split(
        self: Dataset[VideoAnnotationTemplate],
        split: str,
        chunk: int,
        nchunks: int
    ) -> Generator[VideoAnnotation, None, None]: ...
    def stream_split(
        self,
        split: str,
        chunk: int = 0,
        nchunks: int = 1
    ) -> Generator[Union[ImageAnnotation, VideoAnnotation], None, None]:
        """
        Streams a specific split of this dataset from the database. All yielded annotations will adhere to this
        dataset's annotation template.

        If `chunk` and `nchunks` are omitted, then the full split will be streamed. Otherwise, the split will be
        broken into `nchunks` pieces, and only the chunk identified by `chunk` will be streamed.

        Raises `ValueError` (when iteration starts) if this dataset's template is neither an
        `ImageAnnotationTemplate` nor a `VideoAnnotationTemplate`.
        """
        # The template does not change while streaming, so choose the droplet parser once
        # instead of re-checking the template type for every droplet. This also rejects an
        # unsupported template before the endpoint is called.
        if isinstance(self.template, ImageAnnotationTemplate):
            parse_droplet = ImageAnnotation.from_json
        elif isinstance(self.template, VideoAnnotationTemplate): # type: ignore - isinstance is excessive
            parse_droplet = VideoAnnotation.from_json
        else:
            raise ValueError(f"Unknown template kind: {type(self.template)}")

        for droplet in self._endpoints.dataset.stream_split(
            database_uid = self.database,
            namespace = self.repository.namespace,
            name = self.repository.name,
            uid = self.uid,
            split = split,
            chunk = chunk,
            nchunks = nchunks,
        ):
            yield parse_droplet(droplet)

    def get_stable_identifier(self) -> str:
        """
        Returns an identifier for this dataset version, formatted as
        `<namespace>/<name>:<uid>`.
        """
        repo = self.repository
        return f"{repo.namespace}/{repo.name}:{self.uid}"

    def __repr__(self) -> str:
        """
        Renders this dataset through `basic_repr`, passing its stable identifier,
        database and splits.
        """
        return basic_repr(
            "Dataset",
            self.get_stable_identifier(),
            database = self.database,
            splits = self.splits
        )
Ancestors
- typing.Generic
Class variables
var database : str
-
The UID of the database in which this dataset lives.
var repository : DatasetRepository
-
The repository this dataset belongs to.
var splits : List[str]
-
A list of all the splits that this dataset has. By default, this will be `["training", "validation"]`.
var template : ~T
-
The template that all annotations in this dataset version adhere to.
var uid : str
-
The UID of this `Dataset`.
Static methods
def from_json(endpoints: ApiEndpoints, json: JsonDataset) ‑> Union[Dataset[ImageAnnotationTemplate], Dataset[VideoAnnotationTemplate]]
-
Creates a new `Dataset` from a `JsonDataset`.
Expand source code
@staticmethod
def from_json(endpoints: ApiEndpoints, json: JsonDataset) -> AnyDataset:
    """
    Builds a `Dataset` out of its `JsonDataset` form.

    The annotation template is parsed with the template class named by
    `json["template"]["kind"]`; any other kind raises `ValueError`.
    """
    template_json = json["template"]
    kind = template_json["kind"]

    template: Union[ImageAnnotationTemplate, VideoAnnotationTemplate]
    if kind == "ImageAnnotationTemplate":
        template = ImageAnnotationTemplate.from_json(template_json)
    elif kind == "VideoAnnotationTemplate":
        template = VideoAnnotationTemplate.from_json(template_json)
    else:
        raise ValueError(f"Unknown template kind: {template_json['kind']}")

    # Keys are read in the same order as the constructor's keyword arguments.
    fields = dict(
        uid = json["uid"],
        database = json["database"],
        repository = DatasetRepository.from_json(json["repository"]),
        splits = json["splits"],
        template = template,
    )
    return Dataset(endpoints, **fields)
Methods
def get_stable_identifier(self) ‑> str
-
Expand source code
def get_stable_identifier(self) -> str:
    """
    Returns an identifier for this dataset version, formatted as
    `<namespace>/<name>:<uid>`.
    """
    repo = self.repository
    return f"{repo.namespace}/{repo.name}:{self.uid}"
def stream_split(self, split: str, chunk: int = 0, nchunks: int = 1) ‑> Generator[Union[ImageAnnotation, VideoAnnotation], None, None]
-
Streams a specific split of this dataset from the database. All yielded annotations will adhere to this dataset's annotation template.
If `chunk` and `nchunks` are omitted, then the full split will be streamed. Otherwise, the split will be broken into `nchunks` pieces, and only the chunk identified by `chunk` will be streamed.
Expand source code
def stream_split(
    self,
    split: str,
    chunk: int = 0,
    nchunks: int = 1
) -> Generator[Union[ImageAnnotation, VideoAnnotation], None, None]:
    """
    Streams a specific split of this dataset from the database. All yielded annotations will adhere to this
    dataset's annotation template.

    If `chunk` and `nchunks` are omitted, then the full split will be streamed. Otherwise, the split will be
    broken into `nchunks` pieces, and only the chunk identified by `chunk` will be streamed.

    Raises `ValueError` (when iteration starts) if this dataset's template is neither an
    `ImageAnnotationTemplate` nor a `VideoAnnotationTemplate`.
    """
    # The template does not change while streaming, so choose the droplet parser once
    # instead of re-checking the template type for every droplet. This also rejects an
    # unsupported template before the endpoint is called.
    if isinstance(self.template, ImageAnnotationTemplate):
        parse_droplet = ImageAnnotation.from_json
    elif isinstance(self.template, VideoAnnotationTemplate): # type: ignore - isinstance is excessive
        parse_droplet = VideoAnnotation.from_json
    else:
        raise ValueError(f"Unknown template kind: {type(self.template)}")

    for droplet in self._endpoints.dataset.stream_split(
        database_uid = self.database,
        namespace = self.repository.namespace,
        name = self.repository.name,
        uid = self.uid,
        split = split,
        chunk = chunk,
        nchunks = nchunks,
    ):
        yield parse_droplet(droplet)
class DatasetRepository (*, name: str, namespace: str)
-
An object representing the repository a dataset came from.
Expand source code
class DatasetRepository:
    """
    Describes the repository a dataset came from, as a namespace/name pair.
    """

    name: str
    """
    The repository's name.
    """

    namespace: str
    """
    The namespace the repository lives in.
    """

    def __init__(self, *, name: str, namespace: str):
        # Both values are keyword-only and stored verbatim.
        self.namespace = namespace
        self.name = name

    @staticmethod
    def from_json(json: JsonDatasetRepository) -> DatasetRepository:
        """
        Builds a `DatasetRepository` from its JSON form by reading the
        `"name"` and `"namespace"` keys of `json`.
        """
        repo_name = json["name"]
        repo_namespace = json["namespace"]
        return DatasetRepository(namespace = repo_namespace, name = repo_name)
Class variables
var name : str
-
The name of the repository.
var namespace : str
-
The namespace of the repository.
Static methods
def from_json(json: JsonDatasetRepository) ‑> DatasetRepository
-
Creates a new `DatasetRepository` from a `JsonDatasetRepository`.
Expand source code
@staticmethod
def from_json(json: JsonDatasetRepository) -> DatasetRepository:
    """
    Builds a `DatasetRepository` from its JSON form by reading the
    `"name"` and `"namespace"` keys of `json`.
    """
    repo_name = json["name"]
    repo_namespace = json["namespace"]
    return DatasetRepository(namespace = repo_namespace, name = repo_name)