wav2vec_Emotion.py
import torch
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer
import numpy as np
import librosa
# Load datasets
data_files = {
    "train": "./train_voice_sent.csv",
    "validation": "./dev_voice_sent.csv",
    "test": "./test_voice_sent.csv",  # Add the test dataset
}
dataset = load_dataset("csv", data_files=data_files, delimiter=",")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
test_dataset = dataset["test"] # Load the test dataset
# Specify the input and output columns
input_column = "file_path"
output_column = "Sentiment"
# Get unique labels
label_list = train_dataset.unique(output_column)
label_list.sort()
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")
# Create label to id mapping
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}
# Load pre-trained model and feature extractor
model_name = "r-f/wav2vec-english-speech-emotion-recognition"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    # The checkpoint was trained on emotion labels; if the number of sentiment
    # classes differs, this re-initializes the classification head instead of
    # raising a size-mismatch error.
    ignore_mismatched_sizes=True,
)
# Define function to load and preprocess audio
def preprocess_function(examples):
    max_length = 16000 * 10  # 10 seconds at a 16 kHz sampling rate

    def load_audio(file_path):
        audio, sr = librosa.load(file_path, sr=16000)
        if len(audio) > max_length:
            audio = audio[:max_length]
        elif len(audio) < max_length:
            # Pad audio if it's shorter than 10 seconds
            padding = np.zeros(max_length - len(audio))
            audio = np.concatenate((audio, padding))
        return audio

    audio_arrays = [load_audio(file_path) for file_path in examples[input_column]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=16000,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    inputs["labels"] = [label2id[label] for label in examples[output_column]]
    return inputs
# Preprocess the datasets
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)
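# Optional sanity check (a minimal sketch, not part of the original script):
# after preprocessing, every example should carry a fixed-length input_values
# vector of 160000 samples (10 s at 16 kHz) plus an integer label id.
print(len(train_dataset[0]["input_values"]), train_dataset[0]["labels"])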
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="steps",  # renamed to `eval_strategy` in newer transformers releases
    eval_steps=1000,
    save_steps=1000,
    load_best_model_at_end=True,
)
# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=lambda p: {
        "accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()
    },
)
# Train the model
trainer.train()
# Evaluate the model on the test set
test_results = trainer.evaluate(test_dataset)
print(f"Test results: {test_results}")
# Save the model
model.save_pretrained("./wav2vec2-sent-recognition")
feature_extractor.save_pretrained("./wav2vec2-sent-recognition")
# Save the wav2vec2 backbone (encoder) separately so it can be reused as a feature extractor
feature_extractor_model = model.wav2vec2
feature_extractor_model.save_pretrained("./wav2vec_sent-recognition-feature-extractor")
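# Optional inference sketch (an assumption, not part of the original script):
# reload the saved checkpoint through the standard transformers
# audio-classification pipeline; "example.wav" is a hypothetical clip path.
from transformers import pipeline

classifier = pipeline("audio-classification", model="./wav2vec2-sent-recognition")
# print(classifier("example.wav"))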