% Liu (2025), Random Forest-based SNP marker identification, Am. J. Biochem. Biotechnol. 21(2).
% Key is the article DOI (publisher export); kept as-is so existing citations keep resolving.
% NOTE(review): the final abstract sentence refers to "MHILDA" and "LDA", which are not
% mentioned anywhere else in this record -- possibly export contamination from another
% paper. Text is left intact; verify against the publisher page before printing the abstract.
@article{10.3844/ajbbsp.2025.176.189,
  article_type = {journal},
  author       = {Liu, Xiangdong},
  title        = {Random Forest-Based Genome Analysis for Disease Association and {SNP} Marker Identification},
  journal      = {American Journal of Biochemistry and Biotechnology},
  publisher    = {Science Publications},
  volume       = {21},
  number       = {2},
  pages        = {176--189},
  year         = {2025},
  month        = jul,
  doi          = {10.3844/ajbbsp.2025.176.189},
  url          = {https://thescipub.com/abstract/ajbbsp.2025.176.189},
  abstract     = {At present, genomic data analysis has the problems of insufficient model interpretation and limited computational efficiency in disease association research. As a powerful machine learning algorithm, Random Forest has demonstrated excellent ability in processing complex and multi-dimensional data sets and has gradually become a popular tool in the field of bioinformatics. The purpose of this paper is to explore how RF can be applied to genome data mining and reveal the potential relationship between gene variation and disease. The study collected whole-genome sequencing data from 10,000 patients and labeled whether they had a specific type of cardiovascular disease. RF was used to construct a prediction model, and the SNP loci closely related to the occurrence of diseases were identified. The results show that compared with traditional statistical methods, the AUC value of the RF model reaches 0.92, the accuracy rate is as high as 89\%, and the sensitivity and specificity are 87 and 90\%, respectively. This indicates that RF can effectively identify key genetic markers, providing a valuable list of candidate genes for subsequent studies. In order to further verify the effectiveness of the RF model, this study selected the top 50 high-risk SNPs for functional annotation analysis. These variants were found to be mainly focused on genes known to be involved in lipid metabolism, inflammatory response, and immunomodulatory pathways. rs6511723 is located in the APOE-C1/C4/C2 region, which is involved in cholesterol transport; rs1121980, on the other hand, is close to the FTO gene, which can affect weight and obesity risk by encoding a fatty acid oxidase. The ROC curve of MHILDA based on 5x validation is close to the true prediction interval, and its average AUC is 90.45\%, which fully reflects its stable performance, and combined with experimental data, it can be determined that the MHILDA model shows excellent performance on the reference dataset and can accurately predict the potential LDA.},
}