Source code for scimilarity.visualizations

from typing import List, Dict, Tuple, Optional


def aggregate_counts(data: "pandas.DataFrame", levels: List[str]) -> dict:
    """Count sample metadata per group and nest the counts in circlify format.

    Parameters
    ----------
    data: pandas.DataFrame
        A pandas dataframe containing sample metadata.
    levels: List[str]
        The groupby columns, ordered from the outermost level to the
        innermost one.

    Returns
    -------
    dict
        A nested dictionary keyed by group value. Each node stores its count
        under "datum" and, when deeper levels exist, its sub-groups under
        "children".

    Examples
    --------
    >>> circ_dict = aggregate_counts(sample_metadata, ["tissue", "disease"])
    """

    data_dict = {}
    for depth in range(1, len(levels) + 1):
        # Group on the first `depth` columns and count entries per group.
        group_cols = levels[:depth]
        grouped = data.groupby(group_cols, observed=True)[group_cols[0]]
        counts = grouped.count().reset_index(name="count")

        for row in counts.index:
            top_key = counts.iloc[row, 0]
            node = {"datum": counts.loc[row, "count"]}

            if depth == 1:  # top level nodes hang directly off the result
                data_dict[top_key] = node
                continue

            # Walk down the already-built levels, creating the "children"
            # container on the way whenever it is still missing.
            cursor = data_dict[top_key]
            for col in range(1, depth):
                key = counts.iloc[row, col]
                cursor = cursor.setdefault("children", {})
                if key in cursor:  # step into an existing child node
                    cursor = cursor[key]
            cursor[key] = node  # register the new deepest node
    return data_dict
def assign_size(
    data_dict: dict,
    data: "pandas.DataFrame",
    levels: List[str],
    size_column: str,
    name_column: str,
) -> dict:
    """Attach named, sized leaf circles to a circlify format dictionary.

    Parameters
    ----------
    data_dict: dict
        A circlify format dictionary. It is updated in place.
    data: pandas.DataFrame
        A pandas dataframe containing sample metadata.
    levels: List[str]
        The groupby columns that were used to build ``data_dict``.
    size_column: str
        The column whose values are summed to obtain the circle size.
    name_column: str
        The column whose values become the circle names.

    Returns
    -------
    dict
        The same ``data_dict`` object, with one child per name added below
        the deepest level.

    Examples
    --------
    >>> circ_dict = assign_size(circ_dict, sample_metadata, ["tissue", "disease"], size_column="cells", name_column="study")
    """

    subset = data[levels + [size_column, name_column]]
    per_name = subset.groupby(levels + [name_column], observed=True)[size_column]
    sizes = per_name.sum().reset_index(name="count")

    n_levels = len(levels)
    for row in sizes.index:
        # Descend along the level values of this row to the deepest node.
        node = data_dict[sizes.iloc[row, 0]]
        for col in range(1, n_levels):
            node = node["children"][sizes.iloc[row, col]]

        # Add the (name, size) leaf below that node.
        leaves = node.setdefault("children", {})
        leaves[sizes.loc[row, name_column]] = {"datum": sizes.loc[row, "count"]}
    return data_dict
def assign_suffix(
    data_dict: dict,
    data: "pandas.DataFrame",
    levels: List[str],
    suffix_column: str,
    name_column: str,
) -> dict:
    """Assigns circle name and suffix to a circlify format dictionary.

    Every leaf circle whose key equals a row's ``name_column`` value is
    re-keyed to ``"<name>_<suffix>"``, using the row's ``suffix_column``
    value. Once a leaf has been renamed, later rows with the same name in the
    same group no longer match it and are skipped.

    Parameters
    ----------
    data_dict: dict
        A circlify format dictionary. It is updated in place.
    data: pandas.DataFrame
        A pandas dataframe containing sample metadata. Rows are read by
        position, so the dataframe may carry any index.
    levels: List[str]
        Specify the groupby columns for grouping the sample metadata.
    suffix_column: str
        The name of the column that will be used for the circle name suffix.
    name_column: str
        The name of the column that will be used for circle name.

    Returns
    -------
    dict
        A circlify format dictionary (the same object as ``data_dict``).

    Raises
    ------
    KeyError
        If the level values of a row do not lead to a node in ``data_dict``.

    Examples
    --------
    >>> circ_dict = assign_suffix(circ_dict, sample_metadata, ["tissue", "disease"], suffix_column="cells", name_column="study")
    """

    df = data[levels + [suffix_column, name_column]]
    n_levels = len(levels)
    # Column positions inside `df`: the levels first, then suffix, then name.
    suffix_pos = n_levels
    name_pos = n_levels + 1

    # Iterate by position: the index labels of `data` are arbitrary and must
    # not be fed to `.iloc`.
    for row in range(len(df)):
        # find the deepest level in data_dict for this row
        entry = data_dict[df.iloc[row, 0]]
        for col in range(1, n_levels):
            entry = entry["children"][df.iloc[row, col]]

        children = entry.get("children")
        if not children:  # no leaf circles below this node, nothing to rename
            continue

        name = df.iloc[row, name_pos]
        if name in children:
            suffix = df.iloc[row, suffix_pos]
            children[f"{name}_{suffix}"] = children.pop(name)
    return data_dict
def assign_colors(
    data_dict: dict,
    data: "pandas.DataFrame",
    levels: List[str],
    color_column: str,
    name_column: str,
) -> dict:
    """Assigns circle name and color to a circlify format dictionary.

    Every leaf circle whose key equals a row's ``name_column`` value is
    re-keyed to that row's ``color_column`` value.

    Parameters
    ----------
    data_dict: dict
        A circlify format dictionary. It is updated in place.
    data: pandas.DataFrame
        A pandas dataframe containing sample metadata. Rows are read by
        position, so the dataframe may carry any index.
    levels: List[str]
        Specify the groupby columns for grouping the sample metadata.
    color_column: str
        The name of the column that will be used for the circle color.
    name_column: str
        The name of the column that will be used for circle name.

    Returns
    -------
    dict
        A circlify format dictionary (the same object as ``data_dict``).

    Raises
    ------
    KeyError
        If the level values of a row do not lead to a node in ``data_dict``.

    Notes
    -----
    Leaves of the same group that map to the same color value end up under a
    single key, so a later one replaces an earlier one.

    Examples
    --------
    >>> circ_dict = assign_colors(circ_dict, sample_metadata, ["tissue", "disease"], color_column="cells", name_column="study")
    """

    df = data[levels + [color_column, name_column]]
    n_levels = len(levels)
    # Column positions inside `df`: the levels first, then color, then name.
    color_pos = n_levels
    name_pos = n_levels + 1

    # Iterate by position: the index labels of `data` are arbitrary and must
    # not be fed to `.iloc`.
    for row in range(len(df)):
        # find the deepest level in data_dict for this row
        entry = data_dict[df.iloc[row, 0]]
        for col in range(1, n_levels):
            entry = entry["children"][df.iloc[row, col]]

        children = entry.get("children")
        if not children:  # no leaf circles below this node, nothing to rename
            continue

        name = df.iloc[row, name_pos]
        if name in children:
            children[df.iloc[row, color_pos]] = children.pop(name)
    return data_dict
def get_children_data(data_dict: dict) -> List[dict]:
    """Flatten one level of a circlify dictionary into a list, recursively.

    Parameters
    ----------
    data_dict: dict
        A circlify format dictionary mapping circle ids to nodes. Every node
        has a "datum" entry and may have a nested "children" dictionary.

    Returns
    -------
    List[dict]
        One record per circle with keys "id" and "datum", plus "children"
        (a list built the same way) when the node has children.

    Examples
    --------
    >>> children = get_children_data(circ_dict[i]["children"])
    """

    child_data = []
    for circle_id, node in data_dict.items():
        record = {"id": circle_id, "datum": node["datum"]}
        if "children" in node:
            # descend into the nested dictionary of this circle
            record["children"] = get_children_data(node["children"])
        child_data.append(record)
    return child_data
def circ_dict2data(circ_dict: dict) -> List[dict]:
    """Turn a circlify format dictionary into the list layout used by circlify.

    Parameters
    ----------
    circ_dict: dict
        A circlify format dictionary mapping circle ids to nodes. Every node
        has a "datum" entry and may have a nested "children" dictionary.

    Returns
    -------
    List[dict]
        One record per top level circle with keys "id" and "datum", plus
        "children" (converted by ``get_children_data``) when present.

    Examples
    --------
    >>> circ_data = circ_dict2data(circ_dict)
    """

    circ_data = []
    for circle_id, node in circ_dict.items():
        record = {"id": circle_id, "datum": node["datum"]}
        if "children" in node:
            # nested levels are converted by the recursive helper
            record["children"] = get_children_data(node["children"])
        circ_data.append(record)
    return circ_data
def draw_circles(
    circ_data: List[dict],
    title: str = "",
    figsize: Tuple[int, int] = (10, 10),
    filename: Optional[str] = None,
    use_colormap: Optional[str] = None,
    use_suffix: Optional[dict] = None,
    use_suffix_as_color: bool = False,
):
    """Draw the circlify plot.

    Circles are drawn level by level (1, 2, then 3) so that deeper circles are
    added on top of their parents. Level 1 circles get a boxed label, level 2
    circles a plain label, and level 3 circles are only colored.

    Parameters
    ----------
    circ_data: List[dict]
        A circlify format list.
    title: str, default: ""
        The figure title.
    figsize: Tuple[int, int], default: (10, 10)
        The figure size in inches.
    filename: str, optional, default: None
        Filename to save the figure.
    use_colormap: str, optional, default: None
        The colormap identifier. When not given, level 3 circles are drawn in
        red and the suffix options are ignored.
    use_suffix: dict, optional, default: None
        A mapping of suffix to color using a dictionary in the form
        {suffix: float}. The suffix is the part of the circle id after the
        last underscore.
    use_suffix_as_color: bool, default: False
        Use the suffix as the color. This expects the suffix to be a float.
        Only consulted when ``use_suffix`` is not given; if neither is set,
        the circle id itself is passed to the colormap.

    Raises
    ------
    ImportError
        If the optional dependency 'circlify' is not installed.

    Examples
    --------
    >>> draw_circles(circ_data)
    """

    try:
        import circlify as circ
    except ImportError as err:
        raise ImportError(
            "Package 'circlify' not found. Please install with 'pip install circlify'."
        ) from err

    import matplotlib as mpl
    import matplotlib.pyplot as plt

    mpl.rcParams["pdf.fonttype"] = 42  # Type 42 (TrueType) fonts in PDF output

    circles = circ.circlify(circ_data, show_enclosure=True)

    fig, ax = plt.subplots(figsize=figsize)
    # `matplotlib.cm.get_cmap` no longer exists in recent Matplotlib releases;
    # the pyplot accessor is available across versions.
    cmap = plt.get_cmap(use_colormap) if use_colormap else None

    ax.set_title(title)  # title
    ax.axis("off")  # remove axes

    # find axis boundaries
    lim = max(
        max(abs(circle.x) + circle.r, abs(circle.y) + circle.r) for circle in circles
    )
    ax.set_xlim(-lim, lim)
    ax.set_ylim(-lim, lim)

    # 1st level:
    for circle in circles:
        if circle.level != 1:
            continue
        x, y, r = circle
        ax.add_patch(
            plt.Circle(
                (x, y),
                r,
                alpha=0.5,
                linewidth=1,
                facecolor="lightblue",
                edgecolor="black",
            )
        )

    # 2nd level:
    for circle in circles:
        if circle.level != 2:
            continue
        x, y, r = circle
        ax.annotate(circle.ex["id"], (x, y), ha="center", color="black")
        ax.add_patch(
            plt.Circle(
                (x, y),
                r,
                alpha=0.5,
                linewidth=1,
                facecolor="#69b3a2",
                edgecolor="black",
            )
        )

    # 3rd level:
    for circle in circles:
        if circle.level != 3:
            continue
        x, y, r = circle
        if cmap is None:
            ax.add_patch(
                plt.Circle(
                    (x, y),
                    r,
                    alpha=0.5,
                    linewidth=1,
                    facecolor="red",
                    edgecolor="white",
                )
            )
            continue

        circle_id = circle.ex["id"]
        if use_suffix:
            color_fraction = use_suffix[circle_id.split("_")[-1]]
        elif use_suffix_as_color:
            color_fraction = float(circle_id.split("_")[-1])
        else:
            color_fraction = circle_id
        ax.add_patch(
            plt.Circle(
                (x, y),
                r,
                alpha=1,
                linewidth=1,
                facecolor=cmap(color_fraction),
                edgecolor="white",
            )
        )

    # 1st level labels:
    for circle in circles:
        if circle.level != 1:
            continue
        x, y, r = circle
        ax.annotate(
            circle.ex["id"],
            (x, y),
            va="center",
            ha="center",
            bbox=dict(facecolor="white", edgecolor="black", boxstyle="round", pad=0.5),
        )

    if filename:  # save the figure
        fig.savefig(filename, bbox_inches="tight")
def hits_circles(
    metadata: "pandas.DataFrame",
    levels: Optional[List[str]] = None,
    figsize: Tuple[int, int] = (10, 10),
    filename: Optional[str] = None,
):
    """Visualize sample metadata as circle plots for tissue and disease.

    Parameters
    ----------
    metadata: pandas.DataFrame
        A pandas dataframe containing sample metadata for nearest neighbors
        with at least columns: ["study", "cells"], that represent the number
        of circles and circle size respectively, plus the ``levels`` columns.
    levels: list, optional, default: ["tissue", "disease"]
        The columns to use as group levels in the circles hierarchy. ``None``
        selects the default.
    figsize: Tuple[int, int], default: (10, 10)
        Figure size, width x height
    filename: str, optional
        Filename to save the figure.

    Examples
    --------
    >>> hits_circles(metadata)
    """

    # Build the default per call instead of sharing one mutable list object
    # across all calls.
    if levels is None:
        levels = ["tissue", "disease"]

    circ_dict = aggregate_counts(metadata, levels)
    circ_dict = assign_size(
        circ_dict, metadata, levels, size_column="cells", name_column="study"
    )
    circ_data = circ_dict2data(circ_dict)
    draw_circles(circ_data, figsize=figsize, filename=filename)
def hits_heatmap(
    sample_metadata: Dict[str, "pandas.DataFrame"],
    x: str,
    y: str,
    count_type: str = "cells",
    figsize: Tuple[int, int] = (10, 10),
    filename: Optional[str] = None,
):
    """Visualize a list of sample metadata objects as a heatmap.

    The dataframes are stacked with an extra column ``x`` holding the cluster
    name, aggregated per (``x``, ``y``) pair and drawn with ``x`` values as
    heatmap columns and ``y`` values as heatmap rows. The input dataframes are
    not modified.

    Parameters
    ----------
    sample_metadata: Dict[str, pandas.DataFrame]
        A dict where keys are cluster names and values are pandas dataframes
        containing sample metadata for each cluster centroid with columns:
        ["tissue", "disease", "study", "sample"]. The dataframes must also
        contain the column named by ``count_type``.
    x: str
        x-axis label key. This is the name of the column that receives the
        cluster name values.
    y: str
        y-axis label key. This corresponds to the dataframe column to visualize.
    count_type: {"cells", "fraction"}, default: "cells"
        Count type to color in the heatmap. "cells" sums the "cells" column
        per group, "fraction" averages the "fraction" column per group.
    figsize: Tuple[int, int], default: (10, 10)
        Figure size, width x height
    filename: str, optional
        Filename to save the figure.

    Raises
    ------
    ValueError
        If ``count_type`` is not one of the supported options.

    Examples
    --------
    >>> hits_heatmap(sample_metadata, "time", "disease")
    """

    # Validate before the plotting imports so a bad argument fails fast.
    valid_count_types = {"cells", "fraction"}
    if count_type not in valid_count_types:
        raise ValueError(
            f"Unknown count_type {count_type}. Options are {valid_count_types}."
        )

    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns

    mpl.rcParams["pdf.fonttype"] = 42  # Type 42 (TrueType) fonts in PDF output

    # Label each frame with its cluster name on a copy, leaving the caller's
    # dataframes untouched.
    labelled = [
        frame.assign(**{x: cluster}) for cluster, frame in sample_metadata.items()
    ]
    df = pd.concat(labelled, ignore_index=True)

    # The aggregated column carries the same name as the count type.
    grouped = df.groupby([x, y], observed=True)[count_type]
    summary = grouped.sum() if count_type == "cells" else grouped.mean()
    df_m = summary.unstack(level=0).fillna(0)

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        ax=ax,
        data=df_m,
        xticklabels=True,
        yticklabels=True,
        square=True,
        cmap="Blues",
        linewidth=0.01,
        linecolor="gray",
        cbar_kws={"shrink": 0.5},
    )
    ax.tick_params(axis="both", labelsize=8, grid_alpha=0.0)

    # xticks: one tick per plotted column, centered on its cell
    ax.xaxis.tick_top()
    plt.sca(ax)
    plt.xticks(np.arange(df_m.shape[1]) + 0.5, rotation=90)

    # axis labels
    ax.set_xlabel("")
    ax.set_ylabel("")

    # cbar font
    cbar = ax.collections[0].colorbar
    cbar.ax.tick_params(labelsize=6)

    if filename:  # save the figure
        fig.savefig(filename, bbox_inches="tight")