Source code for velocyto.logic

from typing import *
import velocyto as vcy
import numpy as np
import abc


[docs]class Logic(metaclass=abc.ABCMeta): """Base class from wich all the logics should inherit """ def __init__(self) -> None: self.name = "Logic" @property def layers(self) -> List[str]: # This should be overridden if a different set of layers is desired return [] @property def stranded(self) -> bool: return True @property def perform_validation_markup(self) -> bool: return True @property def accept_discordant(self) -> bool: return False
[docs] @abc.abstractmethod # This needs to be overridden def count(self, molitem: vcy.Molitem, cell_bcidx: int, dict_layers_columns: Dict[str, np.ndarray], geneid2ix: Dict[str, int]) -> Union[None, int]: """This methods will have to countain the core operations of the logic to attribute a molecule to one of the cathergories Arguments --------- molitem: vcy.Molitem The :py:class:`vcy.Molitem` object to be considered by the logic cell_bcidx: int The cell index in the memory buffers below dict_layers_columns: Dict[str, np.ndarray] A dictionary mapping the name of a layer with the memory buffer that will be saved in the loom file after counting geneid2ix: Dict[str, int] Dictionary containing the Acession of the genes mapping to its column index position Returns ------- Nothing but it adds the molecule to the appropriate layer (or does not count at all) """ # NOTE I need to generalize this to any set of layers return None
[docs]class Permissive10X(Logic): """Permissive logic for 10X Genomics chemistry This logic differs from the other 10x Logics because: - singletons if the fall in not validated introns are COUNTED UNSPLICED - singletons if the fall in validated introns are COUNTED UNSPLICED - non-singletons if are supported by not validated introns are COUNTED UNSPLICED - non-singletons if are supported by validated introns are COUNTED UNSPLICED """ def __init__(self) -> None: self.name = "Permissive10X" @property def layers(self) -> List[str]: # This should be overridden if a different set of layers is desired return ["spliced", "unspliced", "ambiguous"] @property def stranded(self) -> bool: return True @property def perform_validation_markup(self) -> bool: return True @property def accept_discordant(self) -> bool: return False
[docs] def count(self, molitem: vcy.Molitem, cell_bcidx: int, dict_layers_columns: Dict[str, np.ndarray], geneid2ix: Dict[str, int]) -> int: # NOTE This can be simplified qyuite a bit, without loss of acuracy! # The hits are not compatible with any annotated transcript model spliced = dict_layers_columns["spliced"] unspliced = dict_layers_columns["unspliced"] ambiguous = dict_layers_columns["ambiguous"] # The hits are not compatible with any annotated transcript model if len(molitem.mappings_record) == 0: return 2 # Compatible with one or more transcript models: else: # Check that there are not different possible genes ?? if len(set(i.geneid for i in molitem.mappings_record.keys())) == 1: gene_check: Set[str] = set() has_onlyintron_model = 0 has_only_span_exin_model = 1 has_onlyintron_and_valid_model = 0 has_valid_mixed_model = 0 has_invalid_mixed_model = 0 has_onlyexo_model = 0 has_mixed_model = 0 multi_gene = 0 for transcript_model, segments_list in molitem.mappings_record.items(): gene_check.add(transcript_model.geneid) if len(gene_check) > 1: multi_gene = 1 has_introns = 0 has_exons = 0 has_exseg_with_spliced_flag = 0 has_validated_intron = 0 has_exin_intron_span = 0 has_non3prime = 0 for segment_match in segments_list: if segment_match.maps_to_intron: has_introns = 1 if segment_match.feature.is_validated: has_validated_intron = 1 if segment_match.feature.end_overlaps_with_part_of(segment_match.segment): downstream_exon = segment_match.feature.get_downstream_exon() if downstream_exon.start_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 if segment_match.feature.start_overlaps_with_part_of(segment_match.segment): upstream_exon = segment_match.feature.get_upstream_exon() if upstream_exon.end_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 elif segment_match.maps_to_exon: has_exons = 1 if not segment_match.feature.is_last_3prime: has_non3prime = 1 if segment_match.is_spliced: has_exseg_with_spliced_flag = 1 if has_validated_intron and not has_exons: has_onlyintron_and_valid_model = 1 if has_introns and not has_exons: has_onlyintron_model = 1 if has_exons and not has_introns: has_onlyexo_model = 1 if has_exons and has_introns and not has_validated_intron and not has_exin_intron_span: has_invalid_mixed_model = 1 has_mixed_model = 1 if has_exons and has_introns and has_validated_intron and not has_exin_intron_span: has_valid_mixed_model = 1 has_mixed_model = 1 if not has_exin_intron_span: has_only_span_exin_model = 0 if multi_gene: # Many genes are compatible with the observation, do not count return 1 else: if not len(molitem.mappings_record): # No gene is compatible with the observation, do not count return 2 else: if has_onlyexo_model and not has_onlyintron_model and not has_mixed_model: # More common situation, normal exonic read, count as spliced gene_ix = geneid2ix[transcript_model.geneid] spliced[gene_ix, cell_bcidx] += 1 return 0 if has_only_span_exin_model: # All the compatible transcript models have spanning exon-intron boundaries, count unspliced gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return 0 if has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return 0 else: # Non-singleton in validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return 0 if has_onlyintron_model and not has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in non-validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return 0 else: # Non-singleton in non-validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return 0 if has_invalid_mixed_model and not has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Not validated and mapping to exon and introns, happens rarely in 10X / irrelevant. Count anyways gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return 0 if has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Validated and mapping to exon and introns, happens rarely in 10X. Count as unspliced. gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return 0 if has_onlyintron_model and has_onlyexo_model and not has_mixed_model: # Ambiguity among the transcript models compatible with the mapping, most common case! Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return 0 if has_onlyintron_model and not has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return 0 if not has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return 0 if has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return 3
[docs]class Intermediate10X(Logic): """ValidatedIntrons logic for 10X Genomics chemistry This differs from the other 10x Logics because: - singletons if the fall in not validated introns are DISCARDED - singletons if the fall in validated introns are COUNTED UNSPLICED - non-singletons if are supported by not validated introns are COUNTED UNSPLICED - non-singletons if are supported by validated introns are COUNTED UNSPLICED """ def __init__(self) -> None: self.name = "Intermediate10X" @property def layers(self) -> List[str]: # This should be overridden if a different set of layers is desired return ["spliced", "unspliced", "ambiguous"] @property def stranded(self) -> bool: return True @property def perform_validation_markup(self) -> bool: return True @property def accept_discordant(self) -> bool: return False
[docs] def count(self, molitem: vcy.Molitem, cell_bcidx: int, dict_layers_columns: Dict[str, np.ndarray], geneid2ix: Dict[str, int]) -> None: # NOTE This can be simplified qyuite a bit, without loss of acuracy! # The hits are not compatible with any annotated transcript model spliced = dict_layers_columns["spliced"] unspliced = dict_layers_columns["unspliced"] ambiguous = dict_layers_columns["ambiguous"] # The hits are not compatible with any annotated transcript model if len(molitem.mappings_record) == 0: return # Compatible with one or more transcript models: else: # Check that there are not different possible genes ?? if len(set(i.geneid for i in molitem.mappings_record.keys())) == 1: gene_check: Set[str] = set() has_onlyintron_model = 0 has_only_span_exin_model = 1 has_onlyintron_and_valid_model = 0 has_valid_mixed_model = 0 has_invalid_mixed_model = 0 has_onlyexo_model = 0 has_mixed_model = 0 multi_gene = 0 for transcript_model, segments_list in molitem.mappings_record.items(): gene_check.add(transcript_model.geneid) if len(gene_check) > 1: multi_gene = 1 has_introns = 0 has_exons = 0 has_exseg_with_spliced_flag = 0 has_validated_intron = 0 has_exin_intron_span = 0 has_non3prime = 0 for segment_match in segments_list: if segment_match.maps_to_intron: has_introns = 1 if segment_match.feature.is_validated: has_validated_intron = 1 if segment_match.feature.end_overlaps_with_part_of(segment_match.segment): downstream_exon = segment_match.feature.get_downstream_exon() if downstream_exon.start_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 if segment_match.feature.start_overlaps_with_part_of(segment_match.segment): upstream_exon = segment_match.feature.get_upstream_exon() if upstream_exon.end_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 elif segment_match.maps_to_exon: has_exons = 1 if not segment_match.feature.is_last_3prime: has_non3prime = 1 if segment_match.is_spliced: has_exseg_with_spliced_flag = 1 if has_validated_intron and not has_exons: has_onlyintron_and_valid_model = 1 if has_introns and not has_exons: has_onlyintron_model = 1 if has_exons and not has_introns: has_onlyexo_model = 1 if has_exons and has_introns and not has_validated_intron and not has_exin_intron_span: has_invalid_mixed_model = 1 has_mixed_model = 1 if has_exons and has_introns and has_validated_intron and not has_exin_intron_span: has_valid_mixed_model = 1 has_mixed_model = 1 if not has_exin_intron_span: has_only_span_exin_model = 0 if multi_gene: # many genes are compatible with the observation, do not count return else: if not len(molitem.mappings_record): # no gene is compatible with the observation, do not count return else: if has_onlyexo_model and not has_onlyintron_model and not has_mixed_model: # More common situation, normal exonic read, count as spliced gene_ix = geneid2ix[transcript_model.geneid] spliced[gene_ix, cell_bcidx] += 1 return if has_only_span_exin_model: # All the compatible transcript models have spanning exon-intron boundaries, count unspliced gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return else: # Non singleton in validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in non-validated intron return else: # Non-singleton in non-validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_invalid_mixed_model and not has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Not validated and mapping to exon and introns, happens rarely in 10X / irrelevant. return if has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Validated and mapping to exon and introns, happens rarely in 10X. Count as unspliced. gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and not has_mixed_model: # Ambiguity among the transcript models compatible with the mapping, most common case! Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if not has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return
[docs]class ValidatedIntrons10X(Logic): """ValidatedIntrons logic for 10X Genomics chemistry This differs from the other 10x Logics because: - singletons if the fall in not validated introns are DISCARDED - singletons if the fall in validated introns are COUNTED UNSPLICED - non-singletons if are supported by not validated introns are DISCARDED - non-singletons if are supported by validated introns are COUNTED UNSPLICED """ def __init__(self) -> None: self.name = "ValidatedIntrons10X" @property def layers(self) -> List[str]: # This should be overridden if a different set of layers is desired return ["spliced", "unspliced", "ambiguous"] @property def stranded(self) -> bool: return True @property def perform_validation_markup(self) -> bool: return True @property def accept_discordant(self) -> bool: return False
[docs] def count(self, molitem: vcy.Molitem, cell_bcidx: int, dict_layers_columns: Dict[str, np.ndarray], geneid2ix: Dict[str, int]) -> None: # NOTE This can be simplified qyuite a bit, without loss of acuracy! # The hits are not compatible with any annotated transcript model spliced = dict_layers_columns["spliced"] unspliced = dict_layers_columns["unspliced"] ambiguous = dict_layers_columns["ambiguous"] # The hits are not compatible with any annotated transcript model if len(molitem.mappings_record) == 0: return # Compatible with one or more transcript models: else: # Check that there are not different possible genes ?? if len(set(i.geneid for i in molitem.mappings_record.keys())) == 1: gene_check: Set[str] = set() has_onlyintron_model = 0 has_only_span_exin_model = 1 has_onlyintron_and_valid_model = 0 has_valid_mixed_model = 0 has_invalid_mixed_model = 0 has_onlyexo_model = 0 has_mixed_model = 0 multi_gene = 0 for transcript_model, segments_list in molitem.mappings_record.items(): gene_check.add(transcript_model.geneid) if len(gene_check) > 1: multi_gene = 1 has_introns = 0 has_exons = 0 has_exseg_with_spliced_flag = 0 has_validated_intron = 0 has_exin_intron_span = 0 has_non3prime = 0 for segment_match in segments_list: if segment_match.maps_to_intron: has_introns = 1 if segment_match.feature.is_validated: has_validated_intron = 1 if segment_match.feature.end_overlaps_with_part_of(segment_match.segment): downstream_exon = segment_match.feature.get_downstream_exon() if downstream_exon.start_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 if segment_match.feature.start_overlaps_with_part_of(segment_match.segment): upstream_exon = segment_match.feature.get_upstream_exon() if upstream_exon.end_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 elif segment_match.maps_to_exon: has_exons = 1 if not segment_match.feature.is_last_3prime: has_non3prime = 1 if segment_match.is_spliced: has_exseg_with_spliced_flag = 1 if has_validated_intron and not has_exons: has_onlyintron_and_valid_model = 1 if has_introns and not has_exons: has_onlyintron_model = 1 if has_exons and not has_introns: has_onlyexo_model = 1 if has_exons and has_introns and not has_validated_intron and not has_exin_intron_span: has_invalid_mixed_model = 1 has_mixed_model = 1 if has_exons and has_introns and has_validated_intron and not has_exin_intron_span: has_valid_mixed_model = 1 has_mixed_model = 1 if not has_exin_intron_span: has_only_span_exin_model = 0 if multi_gene: # many genes are compatible with the observation, do not count return else: if not len(molitem.mappings_record): # no gene is compatible with the observation, do not count return else: if has_onlyexo_model and not has_onlyintron_model and not has_mixed_model: # More common situation, normal exonic read, count as spliced gene_ix = geneid2ix[transcript_model.geneid] spliced[gene_ix, cell_bcidx] += 1 return if has_only_span_exin_model: # All the compatible transcript models have spanning exon-intron boundaries, count unspliced gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return else: # Non singleton in validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in non-validated intron return else: # Non-singleton in non-validated intron return if has_invalid_mixed_model and not has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Not validated and mapping to exon and introns, happens rarely in 10X / irrelevant. return if has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Validated and mapping to exon and introns, happens rarely in 10X. Count as unspliced. gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and not has_mixed_model: # Ambiguity among the transcript models compatible with the mapping, most common case! Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if not has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return
[docs]class Stricter10X(Logic): """Stricter logic for 10X Genomics chemistry This differ from the other 10x Logics because: - singletons if the fall in not validated introns are DISCARDED - singletons if the fall in validated introns are DISCARDED - non-singletons if are supported by not validated introns are DISCARDED - non-singletons if are supported by validated introns are COUNTED UNSPLICED """ def __init__(self) -> None: self.name = "Stricter10X" @property def layers(self) -> List[str]: # This should be overridden if a different set of layers is desired return ["spliced", "unspliced", "ambiguous"] @property def stranded(self) -> bool: return True @property def perform_validation_markup(self) -> bool: return True
[docs] def count(self, molitem: vcy.Molitem, cell_bcidx: int, dict_layers_columns: Dict[str, np.ndarray], geneid2ix: Dict[str, int]) -> None: # NOTE This can be simplified qyuite a bit, without loss of acuracy! # The hits are not compatible with any annotated transcript model spliced = dict_layers_columns["spliced"] unspliced = dict_layers_columns["unspliced"] ambiguous = dict_layers_columns["ambiguous"] # The hits are not compatible with any annotated transcript model if len(molitem.mappings_record) == 0: return # Compatible with one or more transcript models: else: # Check that there are not different possible genes ?? if len(set(i.geneid for i in molitem.mappings_record.keys())) == 1: gene_check: Set[str] = set() has_onlyintron_model = 0 has_only_span_exin_model = 1 has_onlyintron_and_valid_model = 0 has_valid_mixed_model = 0 has_invalid_mixed_model = 0 has_onlyexo_model = 0 has_mixed_model = 0 multi_gene = 0 for transcript_model, segments_list in molitem.mappings_record.items(): gene_check.add(transcript_model.geneid) if len(gene_check) > 1: multi_gene = 1 has_introns = 0 has_exons = 0 has_exseg_with_spliced_flag = 0 has_validated_intron = 0 has_exin_intron_span = 0 has_non3prime = 0 for segment_match in segments_list: if segment_match.maps_to_intron: has_introns = 1 if segment_match.feature.is_validated: has_validated_intron = 1 if segment_match.feature.end_overlaps_with_part_of(segment_match.segment): downstream_exon = segment_match.feature.get_downstream_exon() if downstream_exon.start_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 if segment_match.feature.start_overlaps_with_part_of(segment_match.segment): upstream_exon = segment_match.feature.get_upstream_exon() if upstream_exon.end_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 elif segment_match.maps_to_exon: has_exons = 1 if not segment_match.feature.is_last_3prime: has_non3prime = 1 if segment_match.is_spliced: has_exseg_with_spliced_flag = 1 if has_validated_intron and not has_exons: has_onlyintron_and_valid_model = 1 if has_introns and not has_exons: has_onlyintron_model = 1 if has_exons and not has_introns: has_onlyexo_model = 1 if has_exons and has_introns and not has_validated_intron and not has_exin_intron_span: has_invalid_mixed_model = 1 has_mixed_model = 1 if has_exons and has_introns and has_validated_intron and not has_exin_intron_span: has_valid_mixed_model = 1 has_mixed_model = 1 if not has_exin_intron_span: has_only_span_exin_model = 0 if multi_gene: # many genes are compatible with the observation, do not count return else: if not len(molitem.mappings_record): # no gene is compatible with the observation, do not count return else: if has_onlyexo_model and not has_onlyintron_model and not has_mixed_model: # More common situation, normal exonic read, count as spliced gene_ix = geneid2ix[transcript_model.geneid] spliced[gene_ix, cell_bcidx] += 1 return if has_only_span_exin_model: # All the compatible transcript models have spanning exon-intron boundaries, count unspliced gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in validated intron, do not count return else: # Non singleton in validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in non-validated intron return else: # Non-singleton in non-validated intron return if has_invalid_mixed_model and not has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Not validated and mapping to exon and introns, happens rarely in 10X / irrelevant. return if has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Validated and mapping to exon and introns, happens rarely in 10X. Count as unspliced. gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and not has_mixed_model: # Ambiguity among the transcript models compatible with the mapping, most common case! Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if not has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return
[docs]class ObservedSpanning10X(Logic): """ObservedSpanning10X logic for 10X Genomics chemistry This differ from the other 10x Logics because: - singletons if the fall in not validated introns are DISCARDED - singletons if the fall in validated introns are DISCARDED - non-singletons if are supported by not validated introns are DISCARDED - non-singletons if are supported by validated introns are DISCARDED - Therefore only the observed intron spanning reads are counted as UNSPLICED """ def __init__(self) -> None: self.name = "ObservedSpanning10X" @property def layers(self) -> List[str]: # This should be overridden if a different set of layers is desired return ["spliced", "unspliced", "ambiguous"] @property def stranded(self) -> bool: return True @property def perform_validation_markup(self) -> bool: return True @property def accept_discordant(self) -> bool: return False
[docs] def count(self, molitem: vcy.Molitem, cell_bcidx: int, dict_layers_columns: Dict[str, np.ndarray], geneid2ix: Dict[str, int]) -> None: # NOTE This can be simplified qyuite a bit, without loss of acuracy! # The hits are not compatible with any annotated transcript model spliced = dict_layers_columns["spliced"] unspliced = dict_layers_columns["unspliced"] ambiguous = dict_layers_columns["ambiguous"] # The hits are not compatible with any annotated transcript model if len(molitem.mappings_record) == 0: return # Compatible with one or more transcript models: else: # Check that there are not different possible genes ?? if len(set(i.geneid for i in molitem.mappings_record.keys())) == 1: gene_check: Set[str] = set() has_onlyintron_model = 0 has_only_span_exin_model = 1 has_onlyintron_and_valid_model = 0 has_valid_mixed_model = 0 has_invalid_mixed_model = 0 has_onlyexo_model = 0 has_mixed_model = 0 multi_gene = 0 for transcript_model, segments_list in molitem.mappings_record.items(): gene_check.add(transcript_model.geneid) if len(gene_check) > 1: multi_gene = 1 has_introns = 0 has_exons = 0 has_exseg_with_spliced_flag = 0 has_validated_intron = 0 has_exin_intron_span = 0 has_non3prime = 0 for segment_match in segments_list: if segment_match.maps_to_intron: has_introns = 1 if segment_match.feature.is_validated: has_validated_intron = 1 if segment_match.feature.end_overlaps_with_part_of(segment_match.segment): downstream_exon = segment_match.feature.get_downstream_exon() if downstream_exon.start_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 if segment_match.feature.start_overlaps_with_part_of(segment_match.segment): upstream_exon = segment_match.feature.get_upstream_exon() if upstream_exon.end_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 elif segment_match.maps_to_exon: has_exons = 1 if not segment_match.feature.is_last_3prime: has_non3prime = 1 if segment_match.is_spliced: has_exseg_with_spliced_flag = 1 if has_validated_intron and not has_exons: has_onlyintron_and_valid_model = 1 if has_introns and not has_exons: has_onlyintron_model = 1 if has_exons and not has_introns: has_onlyexo_model = 1 if has_exons and has_introns and not has_validated_intron and not has_exin_intron_span: has_invalid_mixed_model = 1 has_mixed_model = 1 if has_exons and has_introns and has_validated_intron and not has_exin_intron_span: has_valid_mixed_model = 1 has_mixed_model = 1 if not has_exin_intron_span: has_only_span_exin_model = 0 if multi_gene: # Many genes are compatible with the observation, do not count return else: if not len(molitem.mappings_record): # No gene is compatible with the observation, do not count return else: if has_onlyexo_model and not has_onlyintron_model and not has_mixed_model: # More common situation, normal exonic read, count as spliced gene_ix = geneid2ix[transcript_model.geneid] spliced[gene_ix, cell_bcidx] += 1 return if has_only_span_exin_model: # All the compatible transcript models have spanning exon-intron boundaries, count unspliced gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in validated intron, do not count return else: # Non-singleton in validated intron, do not count return if has_onlyintron_model and not has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in non-validated intron return else: # Non-singleton in non-validated intron return if has_invalid_mixed_model and not has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Not validated and mapping to exon and introns, happens rarely in 10X / irrelevant. return if has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Validated and mapping to exon and introns, happens rarely in 10X. Count as unspliced. gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and not has_mixed_model: # Ambiguity among the transcript models compatible with the mapping, most common case! Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if not has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return
[docs]class Discordant10X(Logic): """Just a test """ def __init__(self) -> None: self.name = "Discordant10X" @property def layers(self) -> List[str]: # This should be overridden if a different set of layers is desired return ["spliced", "unspliced", "ambiguous"] @property def stranded(self) -> bool: return True @property def perform_validation_markup(self) -> bool: return True @property def accept_discordant(self) -> bool: return True
[docs] def count(self, molitem: vcy.Molitem, cell_bcidx: int, dict_layers_columns: Dict[str, np.ndarray], geneid2ix: Dict[str, int]) -> None: # NOTE This can be simplified qyuite a bit, without loss of acuracy! # The hits are not compatible with any annotated transcript model spliced = dict_layers_columns["spliced"] unspliced = dict_layers_columns["unspliced"] ambiguous = dict_layers_columns["ambiguous"] # The hits are not compatible with any annotated transcript model if len(molitem.mappings_record) == 0: return # Compatible with one or more transcript models: else: # Check that there are not different possible genes ?? if len(set(i.geneid for i in molitem.mappings_record.keys())) == 1: gene_check: Set[str] = set() has_onlyintron_model = 0 has_only_span_exin_model = 1 has_onlyintron_and_valid_model = 0 has_valid_mixed_model = 0 has_invalid_mixed_model = 0 has_onlyexo_model = 0 has_mixed_model = 0 multi_gene = 0 for transcript_model, segments_list in molitem.mappings_record.items(): gene_check.add(transcript_model.geneid) if len(gene_check) > 1: multi_gene = 1 has_introns = 0 has_exons = 0 has_exseg_with_spliced_flag = 0 has_validated_intron = 0 has_exin_intron_span = 0 has_non3prime = 0 for segment_match in segments_list: if segment_match.maps_to_intron: has_introns = 1 if segment_match.feature.is_validated: has_validated_intron = 1 if segment_match.feature.end_overlaps_with_part_of(segment_match.segment): downstream_exon = segment_match.feature.get_downstream_exon() if downstream_exon.start_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 if segment_match.feature.start_overlaps_with_part_of(segment_match.segment): upstream_exon = segment_match.feature.get_upstream_exon() if upstream_exon.end_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 elif segment_match.maps_to_exon: has_exons = 1 if not segment_match.feature.is_last_3prime: has_non3prime = 1 if segment_match.is_spliced: has_exseg_with_spliced_flag = 1 if has_validated_intron and not has_exons: has_onlyintron_and_valid_model = 1 if has_introns and not has_exons: has_onlyintron_model = 1 if has_exons and not has_introns: has_onlyexo_model = 1 if has_exons and has_introns and not has_validated_intron and not has_exin_intron_span: has_invalid_mixed_model = 1 has_mixed_model = 1 if has_exons and has_introns and has_validated_intron and not has_exin_intron_span: has_valid_mixed_model = 1 has_mixed_model = 1 if not has_exin_intron_span: has_only_span_exin_model = 0 if multi_gene: # Many genes are compatible with the observation, do not count return else: if not len(molitem.mappings_record): # No gene is compatible with the observation, do not count return else: if has_onlyexo_model and not has_onlyintron_model and not has_mixed_model: # More common situation, normal exonic read, count as spliced gene_ix = geneid2ix[transcript_model.geneid] spliced[gene_ix, cell_bcidx] += 1 return if has_only_span_exin_model: # All the compatible transcript models have spanning exon-intron boundaries, count unspliced gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return else: # Non-singleton in validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_onlyintron_and_valid_model and not has_mixed_model and not has_onlyexo_model: if len(segments_list) == 1: # Singleton in non-validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return else: # Non-singleton in non-validated intron gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_invalid_mixed_model and not has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Not validated and mapping to exon and introns, happens rarely in 10X / irrelevant. Count anyways gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_valid_mixed_model and not has_onlyintron_model and not has_onlyexo_model and not has_only_span_exin_model: # Validated and mapping to exon and introns, happens rarely in 10X. Count as unspliced. gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and not has_mixed_model: # Ambiguity among the transcript models compatible with the mapping, most common case! Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if not has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and has_mixed_model: # Ambiguity among the transcript models compatible with the mapping. Very rare. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return
[docs]class SmartSeq2(Logic): """SmartSeq2 logic """ def __init__(self) -> None: self.name = "SmartSeq2" @property def layers(self) -> List[str]: # This should be overridden if a different set of layers is desired return ["spliced", "unspliced", "ambiguous", "spanning"] @property def stranded(self) -> bool: return False @property def perform_validation_markup(self) -> bool: return False @property def accept_discordant(self) -> bool: return False
[docs] def count(self, molitem: vcy.Molitem, cell_bcidx: int, dict_layers_columns: Dict[str, np.ndarray], geneid2ix: Dict[str, int]) -> None: # NOTE This can be simplified qyuite a bit, without loss of acuracy! # The hits are not compatible with any annotated transcript model spliced = dict_layers_columns["spliced"] unspliced = dict_layers_columns["unspliced"] ambiguous = dict_layers_columns["ambiguous"] spanning = dict_layers_columns["spanning"] if len(molitem.mappings_record) == 0: return # Compatible with one or more transcript models: else: # Check that there are not different possible genes ?? if len(set(i.geneid for i in molitem.mappings_record.keys())) == 1: gene_check: Set[str] = set() has_onlyintron_model = 0 has_only_span_exin_model = 1 has_onlyexo_model = 0 has_mixed_model = 0 multi_gene = 0 for transcript_model, segments_list in molitem.mappings_record.items(): gene_check.add(transcript_model.geneid) if len(gene_check) > 1: multi_gene = 1 has_introns = 0 has_exons = 0 has_exseg_with_spliced_flag = 0 has_exin_intron_span = 0 for segment_match in segments_list: if segment_match.maps_to_intron: has_introns = 1 if segment_match.feature.end_overlaps_with_part_of(segment_match.segment): downstream_exon = segment_match.feature.get_downstream_exon() if downstream_exon.start_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 if segment_match.feature.start_overlaps_with_part_of(segment_match.segment): upstream_exon = segment_match.feature.get_upstream_exon() if upstream_exon.end_overlaps_with_part_of(segment_match.segment): has_exin_intron_span = 1 elif segment_match.maps_to_exon: has_exons = 1 if segment_match.is_spliced: has_exseg_with_spliced_flag = 1 if has_introns and not has_exons: has_onlyintron_model = 1 if has_exons and not has_introns: has_onlyexo_model = 1 if has_exons and has_introns and not has_exin_intron_span: has_valid_mixed_model = 1 has_mixed_model = 1 if not has_exin_intron_span: has_only_span_exin_model = 0 if multi_gene: # Many genes are compatible with the observation, do not count return else: if not len(molitem.mappings_record): # NOTE it does not happen for Smartseq2 # No gene is compatible with the observation, do not count return else: if has_onlyexo_model and not has_onlyintron_model and not has_mixed_model: # More common situation, normal exonic read, count as spliced gene_ix = geneid2ix[transcript_model.geneid] spliced[gene_ix, cell_bcidx] += 1 return if has_only_span_exin_model: # NOTE This is what I want to count as spanning # All the compatible transcript models have spanning exon-intron boundaries, count unspliced gene_ix = geneid2ix[transcript_model.geneid] spanning[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and not has_mixed_model and not has_onlyexo_model: gene_ix = geneid2ix[transcript_model.geneid] unspliced[gene_ix, cell_bcidx] += 1 return if has_onlyintron_model and has_onlyexo_model and not has_mixed_model: # Ambiguity among the transcript models compatible with the mapping, most common case! Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return if not has_onlyintron_model and has_onlyexo_model and has_mixed_model: # NOTE has_mixed model is used only here in this logic # Ambiguity among the transcript models compatible with the mapping. Count ambiguous gene_ix = geneid2ix[transcript_model.geneid] ambiguous[gene_ix, cell_bcidx] += 1 return
Default: type = Permissive10X