Subtitle-Alignment-Algorithm/Subtitle Alignment.java at master · ragymorkos/Subtitle-Alignment-Algorithm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/* Important constants used in our program */
static final int number_of_APIs = a; // number of speech-to-text APIs available in our system
static int audio_samples_millisecond; // usually found in an audio file's header information
static final double size_frame = 30.0; // usual size of an audio frame
static number_frames; // this is the number of virtual audio frames composing the audio stream
static int used_API = Random.nextInt(number); // random selection of an API
static double[] sound_energy; // array of doubles that will store the audio energy of each frame
static double silence_threshold_percentage = b; // used silence_threshold_percentage
static double silence_threshold; // silence threshold for this video
static final int min_speech_segment = c; // minimum number of frames needed for word utterance
static final int min_distanceTo_silence = d; // minimum number of frames needed between silent frames
static final int consecutive_silence_threshold = e; // minimum number of consecutive silence frames
static final int offset = (int)(consecutive_silence_threshold / 2);

/* Function Declarations. These are all the functions used in the main body of our program. */

// Function that initializes all uninitialized variables above and
// returns the audio file of the stream for use with the speech-to-text API
// VideoFileName: name of the video whose audio stream is extracted and analysed.
// Side effects: assigns the static fields audio_samples_millisecond, number_frames,
// sound_energy and silence_threshold.
// Returns: the extracted audio file.
Audio Initializer(String VideoFileName)
{
   Audio file = // Extract audio from video

   int length = // obtain the duration, in milliseconds, of audio file from file's header info

   // Assign the static fields directly. Declaring locals with the same names would
   // shadow them and leave the fields read by the rest of the program unset.
   audio_samples_millisecond = // obtain this number from audio file's header info
   number_frames = (int) Math.ceil(length / size_frame); // obtain number of audio frames

   // initialize sound energy array
   sound_energy = new double[number_frames];
   if (number_frames == 0)
   {
      // Empty audio stream: there is no frame to measure and no energy to rank.
      silence_threshold = 0.0;
      return file;
   }

   // Duration of the last frame in milliseconds. When the duration is an exact
   // multiple of size_frame the remainder is 0, but the last frame is then a full
   // frame; using 0 would make the division below a division by zero.
   int last_frame = (int) (length % size_frame);
   if (last_frame == 0)
   {
      last_frame = (int) size_frame;
   }

   // calculate energy for each full-length frame (all frames except the last one)
   int samples_per_frame = (int) (audio_samples_millisecond * size_frame);
   for (int i = 0; i < number_frames - 1; i += 1)
   {
      // sum the energy level of all the audio samples in this audio frame
      double sum = 0;
      for (int j = 0; j < samples_per_frame; j += 1)
      {
         sum += // add sound energy of this audio sample
      }

      // take the square root of this sum and divide it by the number of samples.
      // This will be the array entry in the sound_energy array
      sound_energy[i] = Math.sqrt(sum) / samples_per_frame;
   }

   // calculate energy level of last frame, which may be shorter than the others
   double sum = 0;
   int N = audio_samples_millisecond * last_frame;
   for (int j = 0; j < N; j += 1)
   {
      sum += // add sound energy of this audio sample
   }
   // take the square root of this sum and divide it by N. This
   // will be the array entry in the sound_energy array
   sound_energy[number_frames - 1] = Math.sqrt(sum) / N;

   // calculate silence_threshold: sort a copy of the energies (sound_energy itself
   // must stay in frame order) and pick the entry at the configured position
   double[] temp = sound_energy.clone();
   java.util.Arrays.sort(temp);

   // Multiply first, then truncate. Casting the percentage alone to int would
   // truncate it before the multiplication and always select index 0.
   int threshold_index = (int) (silence_threshold_percentage * number_frames);
   // Clamp so that a percentage of 1.0 (or a negative one) stays inside the array.
   threshold_index = Math.max(0, Math.min(threshold_index, number_frames - 1));
   silence_threshold = temp[threshold_index];

   // finally, return audio file of the video's audio stream
   return file;
}

// Functions for the speech-to-text APIs. Each returns the transcribed text as a String
// Transcribes one audio chunk with speech-to-text API number 0
// (the one SpeechToText calls when used_API is 0).
// chunk: the piece of audio to transcribe.
// Returns: the transcription produced for that chunk.
String API0(Audio chunk)
{
   // String variable that will store transcription result.
   // NOTE(review): it is declared without an initial value, so the elided API
   // call below has to assign it before the return statement is reached.
   String result;

   // Utilize the API to transcribe this chunk...
   // (placeholder: the actual API call is not part of this listing)

   return result;
}
// Transcribes one audio chunk with speech-to-text API number 1
// (the one SpeechToText calls when used_API is 1).
// chunk: the piece of audio to transcribe.
// Returns: the transcription produced for that chunk.
String API1(Audio chunk)
{
   // String variable that will store transcription result.
   // NOTE(review): it is declared without an initial value, so the elided API
   // call below has to assign it before the return statement is reached.
   String result;

   // Utilize the API to transcribe this chunk...
   // (placeholder: the actual API call is not part of this listing)

   return result;
}
// Transcribes one audio chunk with speech-to-text API number 2
// (the one SpeechToText calls when used_API is 2).
// chunk: the piece of audio to transcribe.
// Returns: the transcription produced for that chunk.
String API2(Audio chunk)
{
   // String variable that will store transcription result.
   // NOTE(review): it is declared without an initial value, so the elided API
   // call below has to assign it before the return statement is reached.
   String result;

   // Utilize the API to transcribe this chunk...
   // (placeholder: the actual API call is not part of this listing)

   return result;
}

// Function that utilizes the randomly chosen API at the beginning to transcribe the audio chunk
// chunk: the piece of audio to transcribe.
// Returns: the text produced by the API whose index is stored in used_API.
// Throws: IllegalStateException when used_API does not correspond to any API below.
String SpeechToText(Audio chunk)
{
   // switch statement for producing the text using whatever API
   // was randomly chosen at the beginning of the program
   switch (used_API)
   {
      case 0:
         return API0(chunk);
      case 1:
         return API1(chunk);
      case 2:
         return API2(chunk);
      default:
         // Without this branch the method has a path that returns nothing.
         // Failing loudly also exposes a used_API value that has no matching
         // case, e.g. when number_of_APIs grows but no case is added here.
         throw new IllegalStateException(
            "No speech-to-text API registered for index " + used_API);
   }
}

// Function that provides the .srt file for given mp3 argument.
// Return true for success and false for failure
// AudioFileName: name of the temporary audio file; the subtitles are written to
// AudioFileName + ".srt" and the audio file is deleted once they are written.
// Precondition: sound_energy, number_frames and silence_threshold have been
// filled in (see Initializer) before this function is called.
boolean SubtitleGenerator(String AudioFileName)
{
   // Counter for speech segment (used in .srt file)
   int id = 1;

   boolean succeeded;
   PrintWriter writer = null;
   try
   {
      // Make new file and append to it ".srt" extension
      writer = new PrintWriter(AudioFileName + ".srt", "UTF-8");

      /* go through sound energy array looking for speech segments */
      int i = 0;
      int last_silence = 0;
      while (i < number_frames)
      {
         // mark the beginning of possible speech
         int beginning = i;

         // The silence counter starts from zero for every segment. If it kept the
         // value that ended the previous segment, the scan below would never run
         // again and the outer loop would stop advancing.
         int num_silent_frames = 0;

         // As long as there is possible speech (and audio left to read)...
         while (i < number_frames && num_silent_frames <= consecutive_silence_threshold)
         {
            if (sound_energy[i] <= silence_threshold)
            {
               num_silent_frames += 1;
            }
            else
            {
               num_silent_frames = 0;
            }
            i += 1;
         }

         // mark the end of this segment, which is also the end of this silence sequence
         int end = i;
         int this_silence = i;

         // If there was possible speech at least length of min_speech_segment, far
         // enough from the previous silence. The distance is this_silence minus
         // last_silence; the reverse order is never positive.
         if ((end - beginning) > min_speech_segment
            && (this_silence - last_silence) > min_distanceTo_silence)
         {
            last_silence = this_silence; // reset last_silence

            Audio chunk = // obtain chunk of audio file corresponding to interval

            String result = SpeechToText(chunk); // obtain transcription of chunk

            /*** write that segment of speech to .srt file ***/

            // first write id of the speech segment
            writer.println(id);

            // Trim trailing silence from the end of the segment. offset is a number
            // of frames, so it is removed before the conversion to milliseconds.
            // Never trim more than the silence actually seen, so a segment cut
            // short by the end of the audio does not lose speech.
            int trimmed_end = end - Math.min(offset, num_silent_frames);

            // convert from frames to milliseconds
            int start_ms = (int) (beginning * size_frame);
            int end_ms = (int) (trimmed_end * size_frame);

            // write start and end times to .srt file
            writer.println(FormatSrtTime(start_ms) + " --> " + FormatSrtTime(end_ms));

            // write the transcription
            writer.println(result);
            writer.println();

            id += 1; // increase id for next subtitle

            del(chunk); // delete the temporary chunk Audio file
         }
      }

      // PrintWriter does not throw on write errors; checkError flushes the
      // stream and reports whether any write failed.
      succeeded = !writer.checkError();
   }
   catch (java.io.IOException e)
   {
      // The .srt file could not be created or the encoding is unsupported.
      // The contract of this function is to report failure through its result.
      succeeded = false;
   }
   finally
   {
      // close writer (SRT) file, also when something above failed
      if (writer != null)
      {
         writer.close();
      }
   }

   if (succeeded)
   {
      del(AudioFileName); // delete temporary mp3 file of audio stream
   }

   return succeeded;
}

// Formats a time given in milliseconds as an .srt timestamp, hh:mm:ss,mmm.
private static String FormatSrtTime(int time_ms)
{
   int hours = time_ms / (60 * 60 * 1000);
   int minutes = (time_ms / (60 * 1000)) % 60;
   int seconds = (time_ms / 1000) % 60;
   int milliseconds = time_ms % 1000;
   return String.format("%02d:%02d:%02d,%03d", hours, minutes, seconds, milliseconds);
}