Streaming With LLMs

Make ChatGPT speak using PlayHT's text-to-speech API

Integrating Play3.0 TTS with ChatGPT and other LLMs

This guide will walk you through the process of integrating PlayHT's Play3.0 Text-to-Speech (TTS) voice model with ChatGPT and other Large Language Models (LLMs). By following these steps, you'll be able to create an application that generates audio responses from text-based AI outputs.

Prerequisites

  • Node.js installed on your system
  • An OpenAI API key
  • A PlayHT API key and User ID

Step 1: Set Up Your Project

  1. Create a new directory for your project and navigate to it:

    mkdir chatgpt-playht-integration
    cd chatgpt-playht-integration
    
  2. Initialize a new Node.js project:

    npm init -y
    
  3. Open the package.json file and add the following line to enable ES modules:

    {
      ...
      "type": "module",
      ...
    }
    
  4. Install the required dependencies:

    npm install openai playht dotenv express
    

Step 2: Set Up Environment Variables

Create a .env file in your project root and add your API keys and PlayHT User ID:

OPENAI_API_KEY=your_openai_api_key_here
PLAYHT_API_KEY=your_playht_api_key_here
PLAYHT_USER_ID=your_playht_user_id_here
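Optionally, you can confirm that dotenv picks these values up before moving on. The short script below is only a sanity check; the file name checkEnv.js is arbitrary:

import dotenv from 'dotenv';

dotenv.config();

// Report which of the expected variables are present (without printing their values)
for (const name of ['OPENAI_API_KEY', 'PLAYHT_API_KEY', 'PLAYHT_USER_ID']) {
  console.log(`${name}: ${process.env[name] ? 'set' : 'MISSING'}`);
}

Run it with node checkEnv.js and make sure all three variables are reported as set.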

Step 3: Create the ChatGPT Streaming Function

Create a file named streamingGPTText.js with the following content:

import OpenAI from 'openai';
import dotenv from 'dotenv';
import { Readable } from 'node:stream';

dotenv.config();

let openai;
try {
  openai = new OpenAI({
    apiKey: process.env.OPENAI_API_KEY,
  });
} catch (error) {
  console.log('Failed to initialise OpenAI SDK', error.message);
}

export async function streamGptText(prompt) {
  if (!openai) {
    throw new Error('OpenAI API not initialised');
  }

  const startTime = Date.now();
  let firstByteReceivedTime;

  // Async generator that yields ChatGPT's text deltas as they stream in
  async function* createStream() {
    const chatGptResponseStream = await openai.chat.completions.create({
      messages: [{ role: 'user', content: prompt }],
      model: 'gpt-3.5-turbo',
      stream: true,
    });

    for await (const part of chatGptResponseStream) {
      if (!firstByteReceivedTime) {
        firstByteReceivedTime = Date.now();
      }
      yield part.choices[0]?.delta?.content || '';
    }
  }

  // Wrap the generator in a Node.js Readable so it can be piped into PlayHT
  const stream = Readable.from(createStream());

  return {
    stream,
    getChatGptTTFB: () => firstByteReceivedTime ? firstByteReceivedTime - startTime : null
  };
}
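If you want to verify this function on its own before wiring it up to PlayHT, you can consume the returned stream directly. The test script below is a minimal sketch; the file name testGptStream.js and the prompt are arbitrary:

import { streamGptText } from './streamingGPTText.js';

const { stream, getChatGptTTFB } = await streamGptText('Say hello in one short sentence.');

// Print each text chunk as soon as ChatGPT streams it back
for await (const chunk of stream) {
  process.stdout.write(chunk);
}

console.log(`\nChatGPT TTFB: ${getChatGptTTFB()}ms`);

Run it with node testGptStream.js; you should see the response appear incrementally rather than all at once.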

Step 4: Create the Main Application

Create a file named index.js with the following content:

import * as PlayHT from "playht";
import express from "express";
import dotenv from "dotenv";
import { streamGptText } from "./streamingGPTText.js";
import EventEmitter from "events";

dotenv.config();

// Initialize PlayHT SDK
try {
  PlayHT.init({
    apiKey: process.env.PLAYHT_API_KEY,
    userId: process.env.PLAYHT_USER_ID,
  });
} catch (error) {
  console.log("Failed to initialise PlayHT SDK", error.message);
}

// Set the global maximum number of listeners
EventEmitter.defaultMaxListeners = 100;

const app = express();

// Endpoint to convert ChatGPT prompt response into audio stream
app.get("/say-prompt", async (req, res) => {
  try {
    const { prompt } = req.query;
    if (!prompt || typeof prompt !== "string") {
      res.status(400).send("ChatGPT prompt not provided in the request");
      return;
    }

    res.setHeader("Content-Type", "audio/mpeg");

    // Get the stream and TTFB function
    const { stream: gptStream, getChatGptTTFB } = await streamGptText(prompt);

    // Measure TTFB for PlayHT API
    const playHTStartTime = Date.now();
    let playHTTTFBMeasured = false;

    const stream = await PlayHT.stream(gptStream, {
      voiceId: "s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
      voiceEngine: "Play3.0",
    });

    // Set the TTFB values as response headers
    stream.once("data", () => {
      if (!playHTTTFBMeasured) {
        const playHTTTFB = Date.now() - playHTStartTime;
        const chatGptTTFB = getChatGptTTFB();
        playHTTTFBMeasured = true;
        console.log(`ChatGPT TTFB: ${chatGptTTFB}ms, PlayHT TTFB: ${playHTTTFB}ms`);
        res.setHeader("X-PlayHT-TTFB", playHTTTFB);
        res.setHeader("X-ChatGPT-TTFB", chatGptTTFB);
      }
    });

    // Pipe response audio stream to browser
    stream.pipe(res);

  } catch (error) {
    console.error("Error:", error);
    res.status(500).send("Internal Server Error");
  }
});

app.listen(3000, () => {
  console.log("Server is running on http://localhost:3000");
});

Step 5: Run Your Application

  1. Start your application:

    node index.js
    
  2. Your server should now be running on http://localhost:3000.

Step 6: Test the Integration

You can test your integration by sending a GET request to the /say-prompt endpoint with a prompt query parameter. For example:

http://localhost:3000/say-prompt?prompt=Hello, how are you today?

This will return an audio stream of the ChatGPT response being read aloud using the Play3.0 TTS voice model.
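If you prefer to test from a script instead of the browser, the sketch below fetches the endpoint and saves the audio to disk. It assumes Node.js 18 or later (for the built-in fetch); the file name testEndpoint.js is arbitrary, and the two headers it reads are the ones set in index.js:

import { createWriteStream } from 'node:fs';
import { Readable } from 'node:stream';
import { pipeline } from 'node:stream/promises';

const prompt = 'Hello, how are you today?';
const response = await fetch(
  `http://localhost:3000/say-prompt?prompt=${encodeURIComponent(prompt)}`
);

// Latency figures reported by the server
console.log('ChatGPT TTFB:', response.headers.get('x-chatgpt-ttfb'));
console.log('PlayHT TTFB:', response.headers.get('x-playht-ttfb'));

// Stream the audio body to disk as it arrives
await pipeline(Readable.fromWeb(response.body), createWriteStream('response.mp3'));
console.log('Saved response.mp3');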

Customization

  • To use a different voice, change the voiceId in the PlayHT.stream() function call (see the sketch below).
  • To use a different LLM, modify the streamGptText function in streamingGPTText.js.
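In practice, the two lines that change most often are the voiceId passed to PlayHT.stream() in index.js and the model passed to the Chat Completions call in streamingGPTText.js. The voice manifest URL below is a placeholder to replace with one from your own PlayHT voice library, and the model name can be any chat model your OpenAI account has access to:

// index.js: swap the voice (placeholder manifest URL shown)
const stream = await PlayHT.stream(gptStream, {
  voiceId: "s3://voice-cloning-zero-shot/<your-voice-id>/<voice-name>/manifest.json",
  voiceEngine: "Play3.0",
});

// streamingGPTText.js: swap the OpenAI model
const chatGptResponseStream = await openai.chat.completions.create({
  messages: [{ role: 'user', content: prompt }],
  model: 'gpt-4o-mini',
  stream: true,
});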

Conclusion

You've now wired ChatGPT's streaming text output directly into PlayHT's Play3.0 TTS voice model, with streaming on both the input and output sides. This setup can be adapted to other LLMs by modifying the streamGptText function to use the appropriate API calls for your chosen model.