% Garg and Raghava (2008), In Silico Biology 8(2): the SRTpred method for
% predicting classical and non-classical secretory proteins.
% The key "open578" matches the repository record id in the url field; keep it
% stable so existing citations continue to resolve.
@article{open578,
  author    = {Garg, Aarti and Raghava, G. P. S.},
  title     = {A machine learning based method for the prediction of secretory proteins using amino acid composition, their order and similarity-search},
  journal   = {In Silico Biology},
  volume    = {8},
  number    = {2},
  pages     = {129--140},
  year      = {2008},
  publisher = {Bioinformation Systems e.V.},
  note      = {OPEN ACCESS},
  url       = {http://crdd.osdd.net/open/578/},
  keywords  = {classical pathway, non-classical pathway, secretory proteins, prediction, SRTpred, redundancy, dataset size, ANN, SVM, BLAST, PSI-BLAST, N-terminal sequence},
  abstract  = {Most of the prediction methods for secretory proteins require the presence of a correct N-terminal end of the preprotein for correct classification. As large scale genome sequencing projects sometimes assign the 5'-end of genes incorrectly, many proteins are encoded without the correct N-terminus leading to incorrect prediction. In this study, a systematic attempt has been made to predict secretory proteins irrespective of presence or absence of N-terminal signal peptides (also known as classical and non-classical secreted proteins respectively), using machine-learning techniques; artificial neural network (ANN) and support vector machine (SVM). We trained and tested our methods on a dataset of 3321 secretory and 3654 non-secretory mammalian proteins using five-fold cross-validation technique. First, ANN-based modules have been developed for predicting secretory proteins using 33 physico-chemical properties, amino acid composition and dipeptide composition and achieved accuracies of 73.1\%, 76.1\% and 77.1\%, respectively. Similarly, SVM-based modules using 33 physico-chemical properties, amino acid, and dipeptide composition have been able to achieve accuracies of 77.4\%, 79.4\% and 79.9\%, respectively. In addition, BLAST and PSI-BLAST modules designed for predicting secretory proteins based on similarity search achieved 23.4\% and 26.9\% accuracy, respectively. Finally, we developed a hybrid-approach by integrating amino acid and dipeptide composition based SVM modules and PSI-BLAST module that increased the accuracy to 83.2\%, which is significantly better than individual modules. We also achieved high sensitivity of 60.4\% with low value of 5\% false positive predictions using hybrid module. A web server SRTpred has been developed based on above study for predicting classical and non-classical secreted proteins from whole sequence of mammalian proteins, which is available from http://www.imtech.res.in/raghava/srtpred/.},
}