`createTranscript`

const functions = require('firebase-functions');
const admin = require('firebase-admin');
const path = require('path');
const os = require('os');
const fs = require('fs');
const ffmpeg = require('fluent-ffmpeg');


const AUDIO_EXTENSION = 'ogg';
const SAMPLE_RATE_HERTZ = 16000; // or 48000
const MAX_RUNTIME_OPTS = {
  timeoutSeconds: 540, // 9 minutes
  memory: '2GB',
};


const addMinutes = (date, minutes) => {
  return new Date(date.getTime() + minutes * 60000);
};

const getSecondsSinceEpoch = date => {
  return Math.round(date.getTime() / 1000);
};

/**
* Helper funciton to conver video and audio media to audio that meets GCP STT specs 
*/
const convertToAudio = async (admin, storageRef, downloadURLLink, AUDIO_EXTENSION, SAMPLE_RATE_HERTZ) => {
  return new Promise(async (resolve, reject) => {
    // Get the file name.
    const fileName = path.basename(storageRef);
    const filePath = storageRef;
    // Exit if the audio is already converted.
    if (fileName.endsWith(`_output.${AUDIO_EXTENSION}`)) {
      console.error('Already a converted audio.');
      // return null;
      reject(new Error());
    }

    const bucket = admin.storage().bucket();
    // We add a '_output.flac' suffix to target audio file name. That's where we'll upload the converted audio.
    const targetTempFileName = fileName.replace(/\.[^/.]+$/, '') + `_output.${AUDIO_EXTENSION}`;
    const targetTempFilePath = path.join(os.tmpdir(), targetTempFileName);
    const targetStorageFilePath = path.join(path.dirname(filePath), targetTempFileName);

    ffmpeg(downloadURLLink)
      .noVideo()
      .audioChannels(1)
      .audioFrequency(SAMPLE_RATE_HERTZ)
      .audioCodec('libopus')
      .output(targetTempFilePath)
      .on('end', async () => {
        // Uploading the audio Google Cloud Storage
        await bucket.upload(targetTempFilePath, {
          destination: targetStorageFilePath,
          // without resumable false, this seems to fail
          resumable: false,
        });
        // Once the audio has been uploaded delete the local file to free up disk space.
        fs.unlinkSync(targetTempFilePath);
        resolve(targetStorageFilePath);
      })
      .on('error', err => {
        reject(err);
      })
      .run();
  });
};


// TODO: Google cloud function triggers Goolge Cloud task
//  Goolge Cloud task calls cloud function that calls STT SDK
// this function returns null.
// timeout 1min to 9min https://cloud.google.com/functions/docs/concepts/exec#timeout
// https://firebase.google.com/docs/functions/firestore-events
exports.createTranscript = functions
  .runWith(MAX_RUNTIME_OPTS)
  .firestore.document('projects/{projectId}/transcripts/{transcriptId}')
  .onCreate(async (change, context) => {
    // Get an object representing the document
    const newValue = change.data();
    // access a particular field as you would any JS property
    let storageRef = newValue.storageRefName;
    const downloadURLLink = newValue.downloadURL;
    // https://firebase.google.com/docs/storage/admin/start
    const storage = admin.storage();
    // https://github.com/firebase/firebase-tools/issues/1573#issuecomment-517000981
    const bucket = storage.appInternal.options.storageBucket;
  
    // Convert video or audio to audio that meets GCP STT Specs
    const audioForSttRef = await convertToAudio(admin, storageRef, downloadURLLink, AUDIO_EXTENSION, SAMPLE_RATE_HERTZ);
    // save audio ref to firestore
    change.ref.set(
      {
        audioUrl: audioForSttRef,
      },
      {
        merge: true,
      }
    );
  
    // STT
    const client = new speech.SpeechClient();
    const request = {
      config: {
        // https://cloud.google.com/speech-to-text/docs/async-time-offsets
        enableWordTimeOffsets: true,
        // https://cloud.google.com/speech-to-text/docs/automatic-punctuation
        enableAutomaticPunctuation: true,
        // https://cloud.google.com/speech-to-text/docs/multiple-voices
        enableSpeakerDiarization: true,
        diarizationConfig: {
          enableSpeakerDiarization: true,
          //  If not set, the default value is 2.
          // minSpeakerCount: 2,
          //  If not set, the default value is 6.
          // maxSpeakerCount: 3,
        },
        encoding: 'OGG_OPUS',
        // in RecognitionConfig must either be unspecified or match the value in the FLAC header `16000`;
        sampleRateHertz: Number(SAMPLE_RATE_HERTZ).toString(),
        languageCode: 'en-US',
        // https://cloud.google.com/speech-to-text/docs/multiple-languages
        // alternativeLanguageCodes: ['es-ES', 'en-US'],
        // https://cloud.google.com/speech-to-text/docs/reference/rest/v1p1beta1/RecognitionConfig
        // model: 'video'
      },
      audio: {
        uri: `gs://${bucket}/${audioForSttRef}`,
      },
    };
  
    // This creates a recognition job that you can wait for now, or get its result later.
    // initialApiResponse.name is the operation name/"id"
    // initialApiResponse.done is the status of the operation
    const [operation, initialApiResponse] = await client.longRunningRecognize(request);
    console.log('initialApiResponse', initialApiResponse.name);
  
    const sttOperationName = initialApiResponse.name;
    const sttOperationStatus = initialApiResponse.done;
  
    // TODO: I don't think the first response will have just have the results as is?
    if (sttOperationStatus && initialApiResponse.response && initialApiResponse.response.results) {
      //  const [response] = await operation.promise();
      const transcript = gcpToDpe(initialApiResponse.response);
      const { paragraphs, words } = transcript;
      return change.ref.set(
        {
          paragraphs,
          words,
          status: 'done',
        },
        {
          merge: true,
        }
      );
    } else {
      // Start a cloud task that triggers cloud function to check progress of GCP STT operation at latest stage
      const project = admin.instanceId().app.options.projectId;
      // https://firebase.google.com/docs/functions/locations
      const location = 'us-central1';
      const queue = 'firestore-stt';
  
      const tasksClient = new CloudTasksClient();
      const queuePath = tasksClient.queuePath(project, location, queue);
  
      const url = `https://${location}-${project}.cloudfunctions.net/firestoreCheckSTT`;
      console.log('url firestoreCheckSTT', url);
      const docPath = change.ref.path;
  
      const payload = { sttOperationName, docPath };
  
      // time of expiration expressed in epoch seconds
      const now = new Date();
      const timeFromNowWhenToCheckAgainInMinutes = 5;
      const timeFromNowWhenToCheckAgainAsDate = addMinutes(now, timeFromNowWhenToCheckAgainInMinutes);
      // Epoch, also known as Unix timestamps, is the number of seconds (not milliseconds!) that have elapsed since January 1, 1970 at 00:00:00 GMT
      const secondsSinceEpoch = getSecondsSinceEpoch(timeFromNowWhenToCheckAgainAsDate);
  
      // For troubleshooting
      change.ref.set(
        {
          sttOperationName,
          nextSttProgressCheckAt: timeFromNowWhenToCheckAgainAsDate,
        },
        {
          merge: true,
        }
      );
  
      const task = {
        httpRequest: {
          httpMethod: 'POST',
          url,
          body: Buffer.from(JSON.stringify(payload)).toString('base64'),
          headers: {
            'Content-Type': 'application/json',
          },
        },
        scheduleTime: {
          seconds: secondsSinceEpoch,
        },
      };
      const [response] = await tasksClient.createTask({ parent: queuePath, task });
      console.log(`Created task ${response.name}`);
      return null;
    }
  });

Last updated