beignet.datasets

beignet.datasets.FASTADataset

Bases: SizedSequenceDataset

Source code in src/beignet/datasets/_fasta_dataset.py
class FASTADataset(SizedSequenceDataset):
    def __init__(
        self,
        root: str | PathLike,
        *,
        transform: Callable | Transform | None = None,
    ):
        if isinstance(root, str):
            root = Path(root)

        self.root = root.resolve()

        if not self.root.exists():
            raise FileNotFoundError

        self.data = ThreadSafeFile(self.root, open)

        offsets = Path(f"{self.root}.offsets.npy")

        if offsets.exists():
            self.offsets, sizes = numpy.load(f"{offsets}")
        else:
            self.offsets, sizes = self._build_index()

            numpy.save(f"{offsets}", numpy.stack([self.offsets, sizes]))

        self.transform = transform

        super().__init__(self.root, sizes)

    def __getitem__(self, index: int) -> Tuple[str, str]:
        x = self.get(index)

        if self.transform:
            x = self.transform(x)

        return x

    def __len__(self) -> int:
        return self.offsets.size

    def get(self, index: int) -> Tuple[str, str]:
        self.data.seek(self.offsets[index])

        if index == len(self) - 1:
            data = self.data.read()
        else:
            data = self.data.read(self.offsets[index + 1] - self.offsets[index])

        description, *sequence = data.split("\n")

        return "".join(sequence), description

    def _build_index(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
        with open(self.root, "r") as file:
            content = file.read()

        offsets, sizes = [], []

        current_offset, current_size = 0, 0

        parsing = False

        for sequence in tqdm.tqdm(content.splitlines(keepends=True)):
            characters = len(sequence)

            if sequence.startswith(">"):
                if parsing:
                    sizes = [*sizes, current_size]

                    current_size = 0

                offsets = [*offsets, current_offset]

                parsing = True
            elif parsing:
                current_size = current_size + len(sequence.rstrip("\n"))

            current_offset = current_offset + characters

        if parsing:
            sizes = [*sizes, current_size]

        offsets = numpy.array(offsets, dtype=numpy.int64)

        sizes = numpy.array(sizes, dtype=numpy.int64)

        return offsets, sizes
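
A minimal usage sketch (the path below is illustrative; any uncompressed FASTA file works). The first construction scans the file once, builds an offset index, and caches it beside the file as <file>.offsets.npy so later runs skip the indexing pass; each item is the (sequence, description) pair for one record.

from beignet.datasets import FASTADataset

# Illustrative path; the first construction also writes
# "proteins.fasta.offsets.npy" next to the file.
dataset = FASTADataset("proteins.fasta")

print(len(dataset))  # number of records in the file

sequence, description = dataset[0]  # description keeps its leading ">"

Note that a transform, if given, receives the whole (sequence, description) tuple rather than the sequence alone.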

beignet.datasets.RandomEulerAngleDataset

Bases: RandomRotationDataset

Source code in src/beignet/datasets/_random_euler_angle_dataset.py
class RandomEulerAngleDataset(RandomRotationDataset):
    def __init__(
        self,
        size: int,
        axes: str,
        degrees: bool | None = False,
        *,
        device: torch.device | None = None,
        dtype: torch.dtype | None = None,
        generator: Generator | None = None,
        layout: torch.layout | None = torch.strided,
        pin_memory: bool | None = False,
        requires_grad: bool | None = False,
        transform: Callable | Transform | None = None,
    ):
        r"""
        Parameters
        ----------
        size : int
            Output size.

        axes : str
            Axes. 1-3 characters belonging to the set {‘X’, ‘Y’, ‘Z’} for
            intrinsic rotations, or {‘x’, ‘y’, ‘z’} for extrinsic rotations.
            Extrinsic and intrinsic rotations cannot be mixed.

        degrees : bool, optional
            If `True`, Euler angles are assumed to be in degrees. Default,
            `False`.

        generator : torch.Generator, optional
            Pseudo-random number generator. Default, `None`.

        dtype : torch.dtype, optional
            Type of the returned tensor. Default, global default.

        layout : torch.layout, optional
            Layout of the returned tensor. Default, `torch.strided`.

        device : torch.device, optional
            Device of the returned tensor. Default, current device for the
            default tensor type.

        requires_grad : bool, optional
            Whether autograd records operations on the returned tensor.
            Default, `False`.

        pin_memory : bool, optional
            If `True`, returned tensor is allocated in pinned memory. Default,
            `False`.
        """
        super().__init__(
            beignet.random_euler_angle(
                size,
                axes,
                degrees,
                generator=generator,
                dtype=dtype,
                layout=layout,
                device=device,
                requires_grad=requires_grad,
                pin_memory=pin_memory,
            ),
            transform=transform,
        )

__init__

__init__(size, axes, degrees=False, *, device=None, dtype=None, generator=None, layout=torch.strided, pin_memory=False, requires_grad=False, transform=None)

Parameters:

size : int, required
    Output size.

axes : str, required
    Axes. 1-3 characters belonging to the set {‘X’, ‘Y’, ‘Z’} for intrinsic rotations, or {‘x’, ‘y’, ‘z’} for extrinsic rotations. Extrinsic and intrinsic rotations cannot be mixed.

degrees : bool, optional
    If True, Euler angles are assumed to be in degrees. Default, False.

generator : Generator, optional
    Pseudo-random number generator. Default, None.

dtype : dtype, optional
    Type of the returned tensor. Default, global default.

layout : layout, optional
    Layout of the returned tensor. Default, torch.strided.

device : device, optional
    Device of the returned tensor. Default, current device for the default tensor type.

requires_grad : bool, optional
    Whether autograd records operations on the returned tensor. Default, False.

pin_memory : bool, optional
    If True, returned tensor is allocated in pinned memory. Default, False.
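
A usage sketch, assuming each item is a single Euler-angle tensor with one angle per axis character (so shape (3,) for "XYZ"); the DataLoader batching is illustrative:

import torch
from torch.utils.data import DataLoader

from beignet.datasets import RandomEulerAngleDataset

# 1,000 random intrinsic-XYZ Euler-angle triplets, in degrees, with a fixed seed.
dataset = RandomEulerAngleDataset(
    1000,
    "XYZ",
    degrees=True,
    generator=torch.Generator().manual_seed(0),
    dtype=torch.float64,
)

loader = DataLoader(dataset, batch_size=32)

batch = next(iter(loader))
print(batch.shape)  # expected: torch.Size([32, 3]) if each item is a (3,) tensor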

beignet.datasets.RandomQuaternionDataset

Bases: RandomRotationDataset

Source code in src/beignet/datasets/_random_quaternion_dataset.py
class RandomQuaternionDataset(RandomRotationDataset):
    def __init__(
        self,
        size: int,
        canonical: bool = False,
        *,
        device: torch.device | None = None,
        dtype: torch.dtype | None = None,
        generator: Generator | None = None,
        layout: torch.layout | None = torch.strided,
        pin_memory: bool | None = False,
        requires_grad: bool | None = False,
        transform: Callable | Transform | None = None,
    ):
        r"""
        Parameters
        ----------
        size : int
            Output size.

        canonical : bool, optional
            Whether to map the redundant double cover of rotation space to a
            unique canonical single cover. If `True`, then the rotation
            quaternion is chosen from :math:`{q, -q}` such that the :math:`w`
            term is positive. If the :math:`w` term is :math:`0`, then the
            rotation quaternion is chosen such that the first non-zero term of
            the :math:`x`, :math:`y`, and :math:`z` terms is positive.

        generator : torch.Generator, optional
            Pseudo-random number generator. Default, `None`.

        dtype : torch.dtype, optional
            Type of the returned tensor. Default, global default.

        layout : torch.layout, optional
            Layout of the returned tensor. Default, `torch.strided`.

        device : torch.device, optional
            Device of the returned tensor. Default, current device for the
            default tensor type.

        requires_grad : bool, optional
            Whether autograd records operations on the returned tensor.
            Default, `False`.

        pin_memory : bool, optional
            If `True`, returned tensor is allocated in pinned memory.
            Default, `False`.
        """
        super().__init__(
            beignet.random_quaternion(
                size,
                canonical,
                generator=generator,
                dtype=dtype,
                layout=layout,
                device=device,
                requires_grad=requires_grad,
                pin_memory=pin_memory,
            ),
            transform=transform,
        )

__init__

__init__(size, canonical=False, *, device=None, dtype=None, generator=None, layout=torch.strided, pin_memory=False, requires_grad=False, transform=None)

Parameters:

size : int, required
    Output size.

canonical : bool, optional
    Whether to map the redundant double cover of rotation space to a unique canonical single cover. If True, then the rotation quaternion is chosen from {q, -q} such that the w term is positive. If the w term is 0, then the rotation quaternion is chosen such that the first non-zero term of the x, y, and z terms is positive. Default, False.

generator : Generator, optional
    Pseudo-random number generator. Default, None.

dtype : dtype, optional
    Type of the returned tensor. Default, global default.

layout : layout, optional
    Layout of the returned tensor. Default, torch.strided.

device : device, optional
    Device of the returned tensor. Default, current device for the default tensor type.

requires_grad : bool, optional
    Whether autograd records operations on the returned tensor. Default, False.

pin_memory : bool, optional
    If True, returned tensor is allocated in pinned memory. Default, False.
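
A sketch of the canonical option; the comments assume each item is one four-component unit quaternion:

import torch

from beignet.datasets import RandomQuaternionDataset

# 512 random unit quaternions, mapped to the canonical single cover.
dataset = RandomQuaternionDataset(512, canonical=True, dtype=torch.float64)

quaternion = dataset[0]  # assumed shape: (4,)

# Every sample is a unit quaternion; canonical=True also resolves the
# sign ambiguity between q and -q (see the docstring above).
print(torch.linalg.norm(quaternion))  # expected to be ~1.0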

beignet.datasets.RandomRotationMatrixDataset

Bases: RandomRotationDataset

Source code in src/beignet/datasets/_random_rotation_matrix_dataset.py
class RandomRotationMatrixDataset(RandomRotationDataset):
    def __init__(
        self,
        size: int,
        *,
        device: torch.device | None = None,
        dtype: torch.dtype | None = None,
        generator: Generator | None = None,
        layout: torch.layout | None = torch.strided,
        pin_memory: bool | None = False,
        requires_grad: bool | None = False,
        transform: Callable | Transform | None = None,
    ):
        r"""
        Parameters
        ----------
        size : int
            Output size.

        generator : torch.Generator, optional
            Pseudo-random number generator. Default, `None`.

        dtype : torch.dtype, optional
            Type of the returned tensor. Default, global default.

        layout : torch.layout, optional
            Layout of the returned tensor. Default, `torch.strided`.

        device : torch.device, optional
            Device of the returned tensor. Default, current device for the
            default tensor type.

        requires_grad : bool, optional
            Whether autograd records operations on the returned tensor.
            Default, `False`.

        pin_memory : bool, optional
            If `True`, returned tensor is allocated in pinned memory. Default,
            `False`.
        """
        super().__init__(
            beignet.random_rotation_matrix(
                size,
                generator=generator,
                dtype=dtype,
                layout=layout,
                device=device,
                requires_grad=requires_grad,
                pin_memory=pin_memory,
            ),
            transform=transform,
        )

__init__

__init__(size, *, device=None, dtype=None, generator=None, layout=torch.strided, pin_memory=False, requires_grad=False, transform=None)

Parameters:

size : int, required
    Output size.

generator : Generator, optional
    Pseudo-random number generator. Default, None.

dtype : dtype, optional
    Type of the returned tensor. Default, global default.

layout : layout, optional
    Layout of the returned tensor. Default, torch.strided.

device : device, optional
    Device of the returned tensor. Default, current device for the default tensor type.

requires_grad : bool, optional
    Whether autograd records operations on the returned tensor. Default, False.

pin_memory : bool, optional
    If True, returned tensor is allocated in pinned memory. Default, False.
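
A sketch that spot-checks the defining properties of a rotation matrix, assuming each item is a single (3, 3) tensor:

import torch

from beignet.datasets import RandomRotationMatrixDataset

dataset = RandomRotationMatrixDataset(256, dtype=torch.float64)

rotation = dataset[0]  # assumed shape: (3, 3)

# A rotation matrix is orthogonal with determinant +1 (up to floating-point error).
print(torch.allclose(rotation @ rotation.T, torch.eye(3, dtype=torch.float64)))
print(torch.linalg.det(rotation))  # expected to be ~1.0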

beignet.datasets.RandomRotationVectorDataset

Bases: RandomRotationDataset

Source code in src/beignet/datasets/_random_rotation_vector_dataset.py
class RandomRotationVectorDataset(RandomRotationDataset):
    def __init__(
        self,
        size: int,
        degrees: bool = False,
        *,
        device: torch.device | None = None,
        dtype: torch.dtype | None = None,
        generator: Generator | None = None,
        layout: torch.layout | None = torch.strided,
        pin_memory: bool | None = False,
        requires_grad: bool | None = False,
        transform: Callable | Transform | None = None,
    ):
        r"""
        Parameters
        ----------
        size : int
            Output size.

        degrees : bool
            If `True`, rotation vector magnitudes are assumed to be in degrees.
            Default, `False`.

        generator : torch.Generator, optional
            Pseudo-random number generator. Default, `None`.

        dtype : torch.dtype, optional
            Type of the returned tensor. Default, global default.

        layout : torch.layout, optional
            Layout of the returned tensor. Default, `torch.strided`.

        device : torch.device, optional
            Device of the returned tensor. Default, current device for the
            default tensor type.

        requires_grad : bool, optional
            Whether autograd records operations on the returned tensor.
            Default, `False`.

        pin_memory : bool, optional
            If `True`, returned tensor is allocated in pinned memory. Default,
            `False`.
        """
        super().__init__(
            beignet.random_rotation_vector(
                size,
                degrees,
                generator=generator,
                dtype=dtype,
                layout=layout,
                device=device,
                requires_grad=requires_grad,
                pin_memory=pin_memory,
            ),
            transform=transform,
        )

__init__

__init__(size, degrees=False, *, device=None, dtype=None, generator=None, layout=torch.strided, pin_memory=False, requires_grad=False, transform=None)

Parameters:

size : int, required
    Output size.

degrees : bool, optional
    If True, rotation vector magnitudes are assumed to be in degrees. Default, False.

generator : Generator, optional
    Pseudo-random number generator. Default, None.

dtype : dtype, optional
    Type of the returned tensor. Default, global default.

layout : layout, optional
    Layout of the returned tensor. Default, torch.strided.

device : device, optional
    Device of the returned tensor. Default, current device for the default tensor type.

requires_grad : bool, optional
    Whether autograd records operations on the returned tensor. Default, False.

pin_memory : bool, optional
    If True, returned tensor is allocated in pinned memory. Default, False.
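
A sketch, assuming each item is one three-component rotation vector whose direction is the rotation axis and whose norm is the rotation angle:

import torch

from beignet.datasets import RandomRotationVectorDataset

dataset = RandomRotationVectorDataset(128, degrees=True, dtype=torch.float64)

rotation_vector = dataset[0]  # assumed shape: (3,)

# With degrees=True the norm is the rotation angle in degrees, so for a
# random rotation it is expected to fall in [0, 180].
print(torch.linalg.norm(rotation_vector))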

beignet.datasets.SequenceDataset

Bases: Dataset

Source code in src/beignet/datasets/_sequence_dataset.py
class SequenceDataset(Dataset):
    def __init__(self, root: str | PathLike, *args, **kwargs):
        if isinstance(root, str):
            root = Path(root)

        self.root = root.resolve()

beignet.datasets.SizedSequenceDataset

Bases: SequenceDataset

Source code in src/beignet/datasets/_sized_sequence_dataset.py
class SizedSequenceDataset(SequenceDataset):
    def __init__(
        self,
        root: str | PathLike,
        sizes: ArrayLike,
        *args,
        **kwargs,
    ):
        super().__init__(root, *args, **kwargs)

        self.sizes = sizes

    def __len__(self) -> int:
        return len(self.sizes)
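
The sizes array stored here is what enables length-aware batching downstream. A minimal sketch of that idea (the batching helper below is illustrative, not part of beignet):

import numpy

def length_sorted_batches(sizes, batch_size):
    # Order example indices by sequence length so each batch holds
    # similarly sized sequences and padding is minimized.
    order = numpy.argsort(sizes)
    return [order[i : i + batch_size] for i in range(0, len(order), batch_size)]

# With made-up sizes; for a FASTADataset, pass `dataset.sizes` instead.
print(length_sorted_batches(numpy.array([120, 37, 512, 81]), batch_size=2))
# [array([1, 3]), array([0, 2])]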

beignet.datasets.SwissProtDataset

Bases: UniProtDataset

Source code in src/beignet/datasets/_swissprot_dataset.py
class SwissProtDataset(UniProtDataset):
    def __init__(
        self,
        root: str | PathLike | None = None,
        *,
        transform: Callable | Transform | None = None,
        target_transform: Callable | Transform | None = None,
    ):
        """
        Parameters
        ----------
        root : str | PathLike, optional
            Root directory where the dataset subdirectory exists or, if
            it does not exist, the directory where the dataset subdirectory
            will be created and the dataset downloaded.

        transform : Callable, optional
            A `Callable` or `Transform` that maps a sequence to a
            transformed sequence (default: `None`).

        target_transform : Callable, optional
            A `Callable` or `Transform` that maps a target to a transformed
            target (default: `None`).
        """
        super().__init__(
            "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz",
            root,
            "md5:0766df3e5785fc5f1cfc496aa89e86ad",
            transform=transform,
            target_transform=target_transform,
        )

__init__

__init__(root=None, *, transform=None, target_transform=None)

Parameters:

root : str | PathLike, optional
    Root directory where the dataset subdirectory exists or, if it does not exist, the directory where the dataset subdirectory will be created and the dataset downloaded (default: None).

transform : Callable, optional
    A Callable or Transform that maps a sequence to a transformed sequence (default: None).

target_transform : Callable, optional
    A Callable or Transform that maps a target to a transformed target (default: None).
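
A usage sketch; the first construction downloads and decompresses uniprot_sprot.fasta.gz into the pooch cache (or under root, if given) and builds the offset index, both of which are reused afterwards. The transform below is illustrative:

from beignet.datasets import SwissProtDataset

dataset = SwissProtDataset(
    transform=lambda sequence: sequence[:1022],  # illustrative truncation
)

sequence, description = dataset[0]  # description is the FASTA header line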

beignet.datasets.TrEMBLDataset

Bases: UniProtDataset

Source code in src/beignet/datasets/_trembl_dataset.py
class TrEMBLDataset(UniProtDataset):
    def __init__(
        self,
        root: str | PathLike | None = None,
        *,
        transform: Callable | Transform | None = None,
        target_transform: Callable | Transform | None = None,
    ):
        """
        Parameters
        ----------
        root : str | PathLike, optional
            Root directory where the dataset subdirectory exists or, if
            it does not exist, the directory where the dataset subdirectory
            will be created and the dataset downloaded.

        transform : Callable, optional
            A `Callable` or `Transform` that maps a sequence to a
            transformed sequence (default: `None`).

        target_transform : Callable, optional
            A `Callable` or `Transform` that maps a target (a cluster
            identifier) to a transformed target (default: `None`).
        """
        super().__init__(
            "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz",
            root,
            "md5:56f0f20479a88d28fb51db7ef4df90ed",
            transform=transform,
            target_transform=target_transform,
        )

__init__

__init__(root=None, *, transform=None, target_transform=None)

Parameters:

root : str | PathLike, optional
    Root directory where the dataset subdirectory exists or, if it does not exist, the directory where the dataset subdirectory will be created and the dataset downloaded (default: None).

transform : Callable, optional
    A Callable or Transform that maps a sequence to a transformed sequence (default: None).

target_transform : Callable, optional
    A Callable or Transform that maps a target (a cluster identifier) to a transformed target (default: None).

beignet.datasets.UniProtDataset

Bases: FASTADataset

Source code in src/beignet/datasets/_uniprot_dataset.py
class UniProtDataset(FASTADataset):
    def __init__(
        self,
        url: str,
        root: str | PathLike | None = None,
        known_hash: str | None = None,
        *,
        transform: Callable | Transform | None = None,
        target_transform: Callable | Transform | None = None,
    ):
        """
        Parameters
        ----------
        url : str
            URL to the file that needs to be downloaded. Ideally, the URL
            should end with a file name (e.g., `uniref50.fasta.gz`).

        root : str | PathLike, optional
            Root directory where the dataset subdirectory exists or, if
            it does not exist, the directory where the dataset subdirectory
            will be created and the dataset downloaded.

        transform : Callable | Transform, optional
            A `Callable` or `Transform` that maps a sequence to a
            transformed sequence (default: `None`).

        target_transform : Callable | Transform, optional
            A `Callable` or `Transform` that maps a target (a cluster
            identifier) to a transformed target (default: `None`).
        """
        if root is None:
            root = pooch.os_cache("beignet")

        if isinstance(root, str):
            root = Path(root)

        self.root = root.resolve()

        name = self.__class__.__name__.replace("Dataset", "")

        super().__init__(
            pooch.retrieve(
                url,
                known_hash,
                f"{name}.fasta.gz",
                root / name,
                processor=Decompress(
                    name=f"{name}.fasta",
                ),
                progressbar=True,
            ),
        )

        self.transform = transform

        self.target_transform = target_transform

    def __getitem__(self, index: int) -> tuple[str, str]:
        input, target = self.get(index)

        if self.transform:
            input = self.transform(input)

        if self.target_transform:
            target = self.target_transform(target)

        return input, target

__init__

__init__(url, root=None, known_hash=None, *, transform=None, target_transform=None)

Parameters:

url : str, required
    URL to the file that needs to be downloaded. Ideally, the URL should end with a file name (e.g., uniref50.fasta.gz).

root : str | PathLike, optional
    Root directory where the dataset subdirectory exists or, if it does not exist, the directory where the dataset subdirectory will be created and the dataset downloaded (default: None).

known_hash : str, optional
    Checksum used to verify the download, in pooch's "algorithm:hash" format (e.g., "md5:...") (default: None).

transform : Callable | Transform, optional
    A Callable or Transform that maps a sequence to a transformed sequence (default: None).

target_transform : Callable | Transform, optional
    A Callable or Transform that maps a target (a cluster identifier) to a transformed target (default: None).
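
Subclasses such as SwissProtDataset and UniRef50Dataset wire this base class to a specific UniProt FASTA release. A sketch of a custom subclass, with a placeholder URL and no checksum (both are stand-ins, not a real beignet dataset); note that the class name, minus the "Dataset" suffix, becomes the cache subdirectory and file name:

from os import PathLike
from typing import Callable

from beignet.datasets import UniProtDataset

class MyFASTADataset(UniProtDataset):
    def __init__(
        self,
        root: str | PathLike | None = None,
        *,
        transform: Callable | None = None,
        target_transform: Callable | None = None,
    ):
        super().__init__(
            "https://example.org/sequences.fasta.gz",  # placeholder URL
            root,
            None,  # or "md5:..." to verify the download
            transform=transform,
            target_transform=target_transform,
        )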

beignet.datasets.UniRef50Dataset

Bases: UniProtDataset

Source code in src/beignet/datasets/_uniref50_dataset.py
class UniRef50Dataset(UniProtDataset):
    def __init__(
        self,
        root: str | PathLike | None = None,
        *,
        transform: Callable | Transform | None = None,
        target_transform: Callable | Transform | None = None,
    ):
        """
        Parameters
        ----------
        root : str | PathLike, optional
            Root directory where the dataset subdirectory exists or, if
            it does not exist, the directory where the dataset subdirectory
            will be created and the dataset downloaded.

        transform : Callable, optional
            A `Callable` or `Transform` that maps a sequence to a
            transformed sequence (default: `None`).

        target_transform : Callable, optional
            A `Callable` or `Transform` that maps a target (a cluster
            identifier) to a transformed target (default: `None`).
        """
        super().__init__(
            "http://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz",
            root,
            "md5:e638c63230d13ad5e2098115b9cb5d8f",
            transform=transform,
            target_transform=target_transform,
        )

__init__

__init__(root=None, *, transform=None, target_transform=None)

Parameters:

root : str | PathLike, optional
    Root directory where the dataset subdirectory exists or, if it does not exist, the directory where the dataset subdirectory will be created and the dataset downloaded (default: None).

transform : Callable, optional
    A Callable or Transform that maps a sequence to a transformed sequence (default: None).

target_transform : Callable, optional
    A Callable or Transform that maps a target (a cluster identifier) to a transformed target (default: None).
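
A usage sketch; the download is large (tens of gigabytes once decompressed), so point root at a disk with enough space. The target_transform assumes the usual ">UniRef50_<cluster id> ..." header layout, so treat it as illustrative:

from beignet.datasets import UniRef50Dataset

dataset = UniRef50Dataset(
    "/data/beignet",  # illustrative root directory
    target_transform=lambda description: description.split()[0].lstrip(">"),
)

sequence, cluster_id = dataset[0]  # e.g., cluster_id == "UniRef50_..."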

beignet.datasets.UniRef90Dataset

Bases: UniProtDataset

Source code in src/beignet/datasets/_uniref90_dataset.py
class UniRef90Dataset(UniProtDataset):
    def __init__(
        self,
        root: str | Path,
        *,
        transform: Callable | Transform | None = None,
        target_transform: Callable | Transform | None = None,
    ):
        r"""
        Parameters
        ----------
        root : str | Path
            Root directory where the dataset subdirectory exists or, if
            it does not exist, the directory where the dataset subdirectory
            will be created and the dataset downloaded.

        transform : Callable, optional
            A `Callable` or `Transform` that maps a sequence to a
            transformed sequence (default: `None`).

        target_transform : Callable, optional
            A `Callable` or `Transform` that maps a target (a cluster
            identifier) to a transformed target (default: `None`).
        """
        super().__init__(
            "http://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz",
            root,
            "md5:6161bad4d7506365aee882fd5ff9c833",
            transform=transform,
            target_transform=target_transform,
        )

__init__

__init__(root, *, transform=None, target_transform=None)

Parameters:

root : str | Path, required
    Root directory where the dataset subdirectory exists or, if it does not exist, the directory where the dataset subdirectory will be created and the dataset downloaded.

transform : Callable, optional
    A Callable or Transform that maps a sequence to a transformed sequence (default: None).

target_transform : Callable, optional
    A Callable or Transform that maps a target (a cluster identifier) to a transformed target (default: None).

beignet.datasets.UniRef100Dataset

Bases: UniProtDataset

Source code in src/beignet/datasets/_uniref100_dataset.py
class UniRef100Dataset(UniProtDataset):
    def __init__(
        self,
        root: str | Path,
        *,
        transform: Callable | Transform | None = None,
        target_transform: Callable | Transform | None = None,
    ):
        r"""
        Parameters
        ----------
        root : str | Path
            Root directory where the dataset subdirectory exists or, if
            it does not exist, the directory where the dataset subdirectory
            will be created and the dataset downloaded.

        transform : Callable, optional
            A `Callable` or `Transform` that maps a sequence to a
            transformed sequence (default: `None`).

        target_transform : Callable, optional
            A `Callable` or `Transform` that maps a target (a cluster
            identifier) to a transformed target (default: `None`).
        """
        super().__init__(
            "http://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz",
            root,
            "md5:0354240a56f4ca91ff426f8241cfeb7d",
            transform=transform,
            target_transform=target_transform,
        )

__init__

__init__(root, *, transform=None, target_transform=None)

Parameters:

root : str | Path, required
    Root directory where the dataset subdirectory exists or, if it does not exist, the directory where the dataset subdirectory will be created and the dataset downloaded.

transform : Callable, optional
    A Callable or Transform that maps a sequence to a transformed sequence (default: None).

target_transform : Callable, optional
    A Callable or Transform that maps a target (a cluster identifier) to a transformed target (default: None).