Source code for decima.core.metadata
from typing import List, Optional
from dataclasses import dataclass
import pandas as pd
[docs]
@dataclass
class GeneMetadata:
"""Metadata for a gene in the dataset.
Attributes:
name: Gene name
chrom: Chromosome where the gene is located
start: Start position of the region around the gene to perform predictions in the chromosome
end: End position of the region around the gene to perform predictions in the chromosome
strand: Strand orientation (+ or -)
gene_type: Type of gene (e.g., protein_coding)
frac_nan: Fraction of NaN values
mean_counts: Mean count across samples
n_tracks: Number of tracks
gene_start: Gene start position
gene_end: Gene end position
gene_length: Length of the gene
gene_mask_start: Start position of the gene mask
gene_mask_end: End position of the gene mask
frac_N: Fraction of N bases
fold: Cross-validation fold
dataset: Dataset identifier
gene_id: Ensembl gene ID
pearson: Pearson correlation
size_factor_pearson: Size factor Pearson correlation
"""
name: str
chrom: str
start: int
end: int
strand: str
gene_type: str
frac_nan: float
mean_counts: float
n_tracks: int
gene_start: int
gene_end: int
gene_length: int
gene_mask_start: int
gene_mask_end: int
frac_N: float
fold: List[str]
dataset: str
gene_id: str
pearson: float
size_factor_pearson: float
[docs]
@classmethod
def from_series(cls, name: str, series: pd.Series) -> "GeneMetadata":
"""Create GeneMetadata from a pandas Series."""
data = series.to_dict()
data["name"] = name
data["fold"] = [f.strip() for f in data["fold"].strip("[]").replace("'", "").split(",")]
return cls(**data)
[docs]
@dataclass
class CellMetadata:
"""Metadata for a cell in the dataset.
Attributes:
name: Cell identifier
cell_type: Detailed cell type
tissue: Tissue identifier
organ: Organ name
disease: Disease state
study: Study identifier
dataset: Dataset identifier
region: Anatomical region
subregion: Anatomical subregion
celltype_coarse: Coarse cell type classification
n_cells: Number of cells
total_counts: Total count of transcripts
n_genes: Number of genes detected
size_factor: Size normalization factor
train_pearson: Pearson correlation in training set
val_pearson: Pearson correlation in validation set
test_pearson: Pearson correlation in test set
"""
name: str
cell_type: str
tissue: str
organ: str
disease: str
study: str
dataset: str
region: Optional[str]
subregion: Optional[str]
celltype_coarse: Optional[str]
n_cells: int
total_counts: float
n_genes: int
size_factor: float
train_pearson: float
val_pearson: float
test_pearson: float
[docs]
@classmethod
def from_series(cls, name: str, series: pd.Series) -> "CellMetadata":
"""Create CellMetadata from a pandas Series."""
data = series.to_dict()
data["name"] = name
return cls(**data)