@comment{
  CRISPRcasIdentifier tool paper, journal article, 2020.
  The pages field holds the article identifier, which is the DOI suffix,
  because the exported record carried an empty pages field.
  NOTE(review): confirm the identifier against the publisher record.
  The user field is a non-standard annotation that styles ignore.
}
@article{Padilha_Alkhnbashi_Shah-CRISP_Machi_learn-2020,
  author   = {Padilha, Victor A. and
              Alkhnbashi, Omer S. and
              Shah, Shiraz A. and
              de Carvalho, Andr{\'e} C. P. L. F. and
              Backofen, Rolf},
  title    = {{CRISPRcasIdentifier}: {Machine} learning for accurate
              identification and classification of {CRISPR}-{Cas} systems},
  journal  = {GigaScience},
  year     = {2020},
  volume   = {9},
  number   = {6},
  pages    = {giaa062},
  doi      = {10.1093/gigascience/giaa062},
  issn     = {2047-217X},
  pmid     = {32556168},
  user     = {alkhanbo},
  abstract = {BACKGROUND: CRISPR-Cas genes are extraordinarily diverse
              and evolve rapidly when compared to other prokaryotic genes.
              With the rapid increase in newly sequenced archaeal and
              bacterial genomes, manual identification of CRISPR-Cas
              systems is no longer viable. Thus, an automated approach is
              required for advancing our understanding of the evolution
              and diversity of these systems and for finding new
              candidates for genome engineering in eukaryotic models.
              RESULTS: We introduce CRISPRcasIdentifier, a new machine
              learning-based tool that combines regression and
              classification models for the prediction of potentially
              missing proteins in instances of CRISPR-Cas systems and the
              prediction of their respective subtypes. In contrast to
              other available tools, CRISPRcasIdentifier can both detect
              cas genes and extract potential association rules that
              reveal functional modules for CRISPR-Cas systems. In our
              experimental benchmark on the most recently published and
              comprehensive CRISPR-Cas system dataset, CRISPRcasIdentifier
              was compared with recent and state-of-the-art tools.
              According to the experimental results, CRISPRcasIdentifier
              presented the best Cas protein identification and subtype
              classification performance. CONCLUSIONS: Overall, our tool
              greatly extends the classification of CRISPR cassettes and,
              for the first time, predicts missing Cas proteins and
              association rules between Cas proteins. Additionally, we
              investigated the properties of CRISPR subtypes. The proposed
              tool relies not only on the knowledge of manual CRISPR
              annotation but also on models trained using machine
              learning.},
}

