-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSubtitle Alignment.java
More file actions
208 lines (173 loc) · 7.25 KB
/
Subtitle Alignment.java
File metadata and controls
208 lines (173 loc) · 7.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/* Important constants used in our program */
static final int number_of_APIs = a; // number of speech-to-text APIs available in our system
static int audio_samples_millisecond; // usually found in an audio file's header information
static final double size_frame = 30.0; // usual size of an audio frame
static number_frames; // this is the number of virtual audio frames composing the audio stream
static int used_API = Random.nextInt(number); // random selection of an API
static double[] sound_energy; // array of doubles that will store the audio energy of each frame
static double silence_threshold_percentage = b; // used silence_threshold_percentage
static double silence_threshold; // silence threshold for this video
static final int min_speech_segment = c; // minimum number of frames needed for word utterance
static final int min_distanceTo_silence = d; // minimum number of frames needed between silent frames
static final int consecutive_silence_threshold = e; // minimum number of consecutive silence frames
static final int offset = (int)(consecutive_silence_threshold / 2);
/* Function Declarations. These are all the functions used in the main body of our program. */
/**
 * Initializes the static variables that are declared without a value
 * (audio_samples_millisecond, number_frames, sound_energy and silence_threshold)
 * and returns the audio of the stream for use with the speech-to-text API.
 *
 * The lines that end in a comment instead of an expression are placeholders
 * that still have to be filled in with the actual audio-library calls.
 *
 * @param VideoFileName name of the video file whose audio stream is analysed
 * @return the audio extracted from the video
 */
Audio Initializer(String VideoFileName)
{
    Audio file = // Extract audio from video
    int length = // obtain the duration, in milliseconds, of audio file from file's header info

    // Assign the static fields directly. Re-declaring them as locals would shadow the
    // fields and leave them at 0 for SubtitleGenerator, which reads number_frames.
    audio_samples_millisecond = // obtain this number from audio file's header info
    number_frames = (int) Math.ceil(length / size_frame); // obtain number of audio frames

    // initialize sound energy array
    sound_energy = new double[number_frames];
    if (number_frames == 0)
    {
        // empty audio stream: there is no frame to measure and no threshold to derive
        return file;
    }

    // Size of a full frame and of the last frame. When the duration is an exact multiple
    // of the frame size the remainder is 0, which means the last frame is a full frame
    // (a zero-length frame would divide by zero below).
    int frame_length = (int) size_frame;
    int last_frame = length % frame_length;
    if (last_frame == 0)
    {
        last_frame = frame_length;
    }

    // calculate energy for each full-size frame
    for (int i = 0; i < number_frames - 1; i += 1)
    {
        // sum the energy level of all the audio samples in this audio frame
        double sum = 0;
        int N = audio_samples_millisecond * frame_length;
        for (int j = 0; j < N; j += 1)
        {
            sum += // add sound energy of this audio sample
        }
        // take the square root of this sum and divide it by N. This
        // will be the array entry in the sound_energy array
        sound_energy[i] = Math.sqrt(sum) / N;
    }

    // calculate energy level of last frame
    double sum = 0;
    int N = audio_samples_millisecond * last_frame;
    for (int i = 0; i < N; i += 1)
    {
        sum += // add sound energy of this audio sample
    }
    // take the square root of this sum and divide it by N. This
    // will be the array entry in the sound_energy array
    sound_energy[number_frames - 1] = Math.sqrt(sum) / N;

    // calculate silence_threshold: sort a copy of the energies (sound_energy keeps its
    // frame order) and pick the entry at the configured position.
    double[] temp = sound_energy.clone();
    java.util.Arrays.sort(temp);
    // The product is computed before the cast; casting the percentage alone would truncate
    // a fractional value to 0. The index is clamped so a percentage of 1.0 stays in range.
    // NOTE(review): this assumes silence_threshold_percentage is a fraction in [0, 1] - confirm.
    int threshold_index = (int) (silence_threshold_percentage * number_frames);
    threshold_index = Math.max(0, Math.min(threshold_index, number_frames - 1));
    silence_threshold = temp[threshold_index];

    // finally, return audio file of the video's audio stream
    return file;
}
// Functions for the speech-to-text APIs. Each returns the transcribed text as a String
/**
 * Transcribes the given audio chunk with speech-to-text API 0.
 * The API call itself is still a placeholder.
 *
 * @param chunk audio chunk to transcribe
 * @return the transcription; empty until the API call is filled in
 */
String API0(Audio chunk)
{
    // String variable that will store transcription result. It starts out empty so that
    // it is definitely assigned before being returned.
    String result = "";
    // Utilize the API to transcribe this chunk...
    return result;
}
/**
 * Transcribes the given audio chunk with speech-to-text API 1.
 * The API call itself is still a placeholder.
 *
 * @param chunk audio chunk to transcribe
 * @return the transcription; empty until the API call is filled in
 */
String API1(Audio chunk)
{
    // String variable that will store transcription result. It starts out empty so that
    // it is definitely assigned before being returned.
    String result = "";
    // Utilize the API to transcribe this chunk...
    return result;
}
/**
 * Transcribes the given audio chunk with speech-to-text API 2.
 * The API call itself is still a placeholder.
 *
 * @param chunk audio chunk to transcribe
 * @return the transcription; empty until the API call is filled in
 */
String API2(Audio chunk)
{
    // String variable that will store transcription result. It starts out empty so that
    // it is definitely assigned before being returned.
    String result = "";
    // Utilize the API to transcribe this chunk...
    return result;
}
/**
 * Transcribes the audio chunk with the API that was randomly chosen
 * (used_API) at the beginning of the program.
 *
 * @param chunk audio chunk to transcribe
 * @return the transcription produced by the selected API function
 * @throws IllegalStateException if used_API has no matching API function
 */
String SpeechToText(Audio chunk)
{
    // dispatch on the index of the selected API
    switch (used_API)
    {
        case 0:
            return API0(chunk);
        case 1:
            return API1(chunk);
        case 2:
            return API2(chunk);
        default:
            // Without this branch the method has a path with no return value. It is
            // reached when number_of_APIs allows an index that has no case above.
            throw new IllegalStateException(
                "No speech-to-text API registered for index " + used_API);
    }
}
/**
 * Provides the .srt file for the given mp3 argument.
 *
 * Walks the sound_energy array, cuts it into segments that end after more than
 * consecutive_silence_threshold silent frames in a row (or at the end of the stream),
 * transcribes every segment that is long enough, and writes it as a numbered
 * subtitle entry to AudioFileName + ".srt". The temporary mp3 file is deleted
 * once the subtitle file has been written.
 *
 * @param AudioFileName name of the temporary mp3 file of the audio stream
 * @return true for success, false if the .srt file could not be created or written
 */
boolean SubtitleGenerator(String AudioFileName)
{
    // Counter for speech segment (used in .srt file)
    int id = 1;

    // Make new file and append to it ".srt" extension. try-with-resources closes the
    // writer on every path, including failures.
    try (PrintWriter writer = new PrintWriter(AudioFileName + ".srt", "UTF-8"))
    {
        /* go through sound energy array looking for speech segments */
        int i = 0;
        int last_silence = 0;
        while (i < number_frames)
        {
            // mark the beginning of this segment
            int beginning = i;
            // The silence run is counted per segment. If it were carried over from the
            // previous segment, the loop below would be skipped and i would never advance.
            int num_silent_frames = 0;

            // As long as there is possible speech (and frames left to inspect)...
            while (i < number_frames && num_silent_frames <= consecutive_silence_threshold)
            {
                if (sound_energy[i] <= silence_threshold)
                {
                    num_silent_frames += 1;
                }
                else
                {
                    num_silent_frames = 0;
                }
                i += 1;
            }

            // mark the end of this segment, which is also the end of this silence sequence
            int end = i;
            int this_silence = i;

            // If there was possible speech at least length of min_speech_segment and the
            // silence is far enough (in frames) from the last silence that produced a subtitle...
            if ((end - beginning) > min_speech_segment
                && (this_silence - last_silence) > min_distanceTo_silence)
            {
                last_silence = this_silence; // reset last_silence
                Audio chunk = // obtain chunk of audio file corresponding to interval
                String result = SpeechToText(chunk); // obtain transcription of chunk

                /*** write that segment of speech to .srt file ***/
                // first write id of the speech segment
                writer.println(id);

                // offset is a number of frames, so it is subtracted before converting
                // frames to milliseconds; the end never moves before the beginning
                int start_ms = (int) (beginning * size_frame);
                int end_ms = (int) (Math.max(beginning, end - offset) * size_frame);

                // write start and end times to .srt file
                writer.println(formatSrtTimestamp(start_ms) + " --> " + formatSrtTimestamp(end_ms));

                // write the transcription
                writer.println(result);
                writer.println();

                id += 1; // increase id for next subtitle
                del(chunk); // delete the temporary chunk Audio file
            }
        }

        // PrintWriter does not throw on write errors; it records them instead
        if (writer.checkError())
        {
            return false;
        }
    }
    catch (java.io.IOException e)
    {
        // the .srt file could not be created, or the encoding is not supported
        return false;
    }

    del(AudioFileName); // delete temporary mp3 file of audio stream
    return true;
}

// Formats a time given in milliseconds as an .srt timestamp: HH:MM:SS,mmm
private static String formatSrtTimestamp(int milliseconds)
{
    int hours = milliseconds / (60 * 60 * 1000);
    int minutes = (milliseconds / (60 * 1000)) % 60;
    int seconds = (milliseconds / 1000) % 60;
    int millis = milliseconds % 1000;
    return String.format("%02d:%02d:%02d,%03d", hours, minutes, seconds, millis);
}