Skip to content

Commit

Permalink
add Probability in breed identifier module
Browse files Browse the repository at this point in the history
  • Loading branch information
guoyingwei6 committed Sep 9, 2024
1 parent 19262e6 commit 2420a2e
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 27 deletions.
Binary file modified attachments/Breed_identifier_accurate_model.pkl
Binary file not shown.
Binary file modified attachments/Breed_identifier_fast_model.pkl
Binary file not shown.
22 changes: 10 additions & 12 deletions modules/Breed_identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,17 @@ def load_model_accurate():
return clf

def breed_classifier(genotype_array, model='accurate'):
    """Predict a breed and its highest class probability for each sample.

    Args:
        genotype_array: Feature matrix handed unchanged to the classifier's
            ``predict`` and ``predict_proba`` (presumably one row per sample
            of imputed genotype codes -- confirm against the caller).
        model: Which saved classifier to load: ``'fast'`` or ``'accurate'``.

    Returns:
        A list of ``(breed_name, max_probability)`` tuples, one per
        prediction returned by the classifier.

    Raises:
        ValueError: If ``model`` is neither ``'fast'`` nor ``'accurate'``.
    """
    if model == 'fast':
        clf = load_model_fast()
    elif model == 'accurate':
        clf = load_model_accurate()
    else:
        # Without this branch ``clf`` would be unbound and the failure would
        # surface later as a confusing UnboundLocalError.
        raise ValueError(
            f"Unknown model {model!r}; expected 'fast' or 'accurate'."
        )

    predictions = clf.predict(genotype_array)
    probs = clf.predict_proba(genotype_array)
    # Highest class probability per row.
    # NOTE(review): if the loaded estimator is an SVC trained with
    # probability=True, scikit-learn documents that predict_proba can
    # disagree with predict, so this maximum may not always belong to the
    # predicted label -- confirm for the shipped models.
    max_probs = np.max(probs, axis=1)

    # Translate the classifier's numeric/encoded labels into breed names.
    breed_code_dict = load_breed_codes()
    breed_predictions = [breed_code_dict[code] for code in predictions]
    return list(zip(breed_predictions, max_probs))



Expand All @@ -45,14 +47,10 @@ def analysis():
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
# 使用 fit_transform 方法填充缺失值
gt_array_imputed = imputer.fit_transform(gt_array)


result = breed_classifier(gt_array_imputed, model=model)
# 样本名和预测结果合并
combined_results = list(zip(sample_names, result))
for sample, breed in combined_results:
print(f"{sample}: {breed}")

results = breed_classifier(gt_array_imputed, model=model)
results_df = pd.DataFrame(results, columns=['Breed', 'Probability'])
results_df.insert(0, 'Sample', sample_names) # 将样本名插入到结果DataFrame的第一列
print(results_df.to_string(index=False)) # 输出结果DataFrame,不显示索引

if __name__ == '__main__':
'''python3 modules/Breed_identifier.py attachments/genotypes_for_Breed_identifier_accurate_model.txt accurate'''
Expand Down
43 changes: 29 additions & 14 deletions pages/01Breed_identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def page_frame():
we found the workflow using RF as feature selector and SVM as classifier has the best performance.
For more detailed information on the accuracy of different models and factors influencing the accuracy, please refer to our paper.
Here, we provide the classification models with 500 and 2000 SNPs, respectively.
Here, we provide the classification models with 100 and 1,000 SNPs, respectively.
You can choose the model according to your data and expectations.
''')
Expand Down Expand Up @@ -49,11 +49,13 @@ def page_frame():
**3. Select the model to use for analysis.**
- There are two models available: '**fast**' and '**accurate**'.
- The 'fast' model uses 100 SNPs, while the 'accurate' model uses 1000 SNPs.
- The 'fast' model uses 100 SNPs, while the 'accurate' model uses 1,000 SNPs.
- The 'fast' model is recommended for quick analysis, while the 'accurate' model provides more accurate results.
**4. Click the 'Analyze' button to predict the breed.**
- You can use the demo file mentioned above to test the tool and see the output details.
- You can see the predicted breed and the probability of the prediction for each individual in the results table.
- If the probability is **below 0.4**, the prediction may be less reliable, suggesting that the individual could be a mixed breed.
In this case, you can consider using the **GBC estimator tool** to estimate the genomic breed content of the individual.
''')
st.success('''## Analysis''')

Expand All @@ -76,22 +78,36 @@ def load_model_accurate():
clf = joblib.load('attachments/Breed_identifier_accurate_model.pkl')
return clf

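# Previous breed-prediction function; it could not return probabilities, so the model was replaced with one that supports probability prediction.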
#def breed_classifier(genotype_array, model='accurate'):
# """品种分类函数。"""
# if model == 'fast':
# clf = load_model_fast()
# elif model == 'accurate':
# clf = load_model_accurate()
# prediction = clf.predict(genotype_array)
# breed_code_dict = load_breed_codes()
# breed_prediction = [breed_code_dict[code] for code in prediction]
# return breed_prediction

def breed_classifier(genotype_array, model='accurate'):
    """Predict a breed and its highest class probability for each sample.

    Args:
        genotype_array: Feature matrix handed unchanged to the classifier's
            ``predict`` and ``predict_proba`` (presumably one row per sample
            of imputed genotype codes -- confirm against the caller).
        model: Which saved classifier to load: ``'fast'`` or ``'accurate'``.

    Returns:
        A list of ``(breed_name, max_probability)`` tuples, one per
        prediction returned by the classifier.

    Raises:
        ValueError: If ``model`` is neither ``'fast'`` nor ``'accurate'``.
    """
    if model == 'fast':
        clf = load_model_fast()
    elif model == 'accurate':
        clf = load_model_accurate()
    else:
        # Without this branch ``clf`` would be unbound and the failure would
        # surface later as a confusing UnboundLocalError.
        raise ValueError(
            f"Unknown model {model!r}; expected 'fast' or 'accurate'."
        )

    predictions = clf.predict(genotype_array)
    probs = clf.predict_proba(genotype_array)
    # Highest class probability per row.
    # NOTE(review): if the loaded estimator is an SVC trained with
    # probability=True, scikit-learn documents that predict_proba can
    # disagree with predict, so this maximum may not always belong to the
    # predicted label -- confirm for the shipped models.
    max_probs = np.max(probs, axis=1)

    # Translate the classifier's numeric/encoded labels into breed names.
    breed_code_dict = load_breed_codes()
    breed_predictions = [breed_code_dict[code] for code in predictions]
    return list(zip(breed_predictions, max_probs))



def analysis():
uploaded_file = st.file_uploader("Please upload a genotype file to begin analysis")
model_choice = st.selectbox('Choose the model to use for analysis:', ['accurate', 'fast'], index=0) # 默认选择'fast'
model_choice = st.selectbox('Choose the model to use for analysis:', ['accurate', 'fast'], index=0) # 默认选择'accurate'
if uploaded_file is not None:
try:
gt_df = pd.read_csv(uploaded_file, sep='\s+', header=None)
Expand All @@ -111,13 +127,12 @@ def analysis():

if st.button('Analyze'):
if 'gt_array_imputed' in st.session_state and 'model_choice' in st.session_state:
result = breed_classifier(st.session_state.gt_array_imputed, model=st.session_state.model_choice)
# 样本名和预测结果合并
combined_results = list(zip(st.session_state.sample_names, result))
st.session_state.result = combined_results
results = breed_classifier(st.session_state.gt_array_imputed, model=st.session_state.model_choice)
results_df = pd.DataFrame(results, columns=['Breed', 'Probability'])
results_df.insert(0, 'Sample', sample_names) # 将样本名插入到结果DataFrame的第一列
st.session_state.results_df = results_df
st.subheader('Analysis Results')
for sample, breed in combined_results:
st.write(f"{sample}: {breed}")
st.table(st.session_state.results_df)
else:
st.error("No genotype data to analyze. Please upload a file and select a model.")

Expand Down
2 changes: 1 addition & 1 deletion pages/02GBC_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
- A genotype file (**recoded by 0, 1 and 2**) is needed with **one individual per column** and **one SNP per line**.
The first column should be the SNP ID (CHR:POS) based on **ARS-UCD2.0** and the first row should be the sample ID.
- The file should be in the format of a **space or tab-separated** text file.
- More accurate results depend on more SNPs. We recommend using a file with **at least 1000 SNPs**, and **50,000 SNPs** above are highly recommended.
- More accurate results depend on more SNPs. We recommend using a file with **at least 1,000 SNPs**, and **50,000 SNPs** above are highly recommended.
- **Missing values (NA)** do not affect the analysis, but the more missing values, the less accurate the results.
So, we highly recommend performing **imputation** with BEAGLE before analysis if your data contains missing values.
- If you don't have a genotype file now or want to see the details of the file format,
Expand Down

0 comments on commit 2420a2e

Please sign in to comment.