mirror of
https://github.com/algora-io/tv.git
synced 2025-02-14 01:59:50 +02:00
292 lines
4.8 KiB
Plaintext
292 lines
4.8 KiB
Plaintext
|
# COSSgpt
|
||
|
|
||
|
```elixir
|
||
|
import Ecto.Query
|
||
|
import Ecto.Changeset
|
||
|
|
||
|
alias Algora.{Accounts, Library, Repo, Storage, Cache, ML}
|
||
|
|
||
|
IEx.configure(inspect: [charlists: :as_lists])
|
||
|
```
|
||
|
|
||
|
## Transcription pipeline
|
||
|
|
||
|
```elixir
|
||
|
defmodule COSSgpt do
  @moduledoc """
  Notebook helper that pushes one podcast recording through the pipeline:
  upload the mp3, transcribe it, store subtitles, chunk the transcript,
  create embeddings and store the resulting segments.

  The module relies on the aliases (`Library`, `Repo`, `Storage`, `Cache`,
  `ML`) and the `Ecto.Changeset` import set up in the notebook's first cell.
  """

  alias Algora.Library.Video

  # Default folder holding the local mp3 files; override it per call with the
  # `:dir` option instead of editing the module.
  @dir "/home/zaf/Desktop/podcast audio"

  @doc """
  Runs the whole pipeline for the video with the given `id`.

  ## Parameters

    * `id` - id passed to `Library.get_video!/1`.
    * `filename` - name of the mp3 file inside the audio directory.
    * `opts` - keyword options:
      * `:dir` - directory that contains `filename` (defaults to `@dir`).

  Each expensive step is wrapped in `Cache.fetch/2` under a key derived from
  the video slug. Returns whatever `Cache.fetch/2` yields for the
  `"<slug>/segments"` key.

  Raises if the video cannot be loaded, the index cannot be loaded, or any of
  the bang (`!`) Repo calls fail.
  """
  def transcribe_video(id, filename, opts \\ []) do
    dir = Keyword.get(opts, :dir, @dir)

    video = Library.get_video!(id)

    # Loaded up front so a missing/broken index fails before any upload or
    # transcription work is started.
    index = ML.load_index!()

    IO.puts("\n⌛ processing #{video.title}")

    video = ensure_uuid(video)
    slug = Video.slug(video)

    IO.puts("- uploading mp3 to tigris")
    upload_audio(video, slug, Path.join(dir, filename))

    IO.puts("- transcribing audio")
    transcribe_audio(video, slug)

    IO.puts("- chunking transcript")
    chunks = ML.chunk(video)

    IO.puts("- creating embeddings")

    embeddings =
      Cache.fetch("#{slug}/embeddings", fn ->
        ML.create_embeddings(chunks)
      end)

    IO.puts("- creating segments")

    Cache.fetch("#{slug}/segments", fn ->
      store_segments(index, chunks, embeddings)
    end)
  end

  # Persists a uuid for videos that do not have one yet; otherwise returns the
  # video unchanged.
  defp ensure_uuid(%{uuid: nil} = video) do
    video |> change() |> Video.put_video_uuid() |> Repo.update!()
  end

  defp ensure_uuid(video), do: video

  # Uploads the local mp3 to "<uuid>/index.mp3", cached under "<slug>/upload".
  defp upload_audio(video, slug, local_path) do
    Cache.fetch("#{slug}/upload", fn ->
      # NOTE(review): the upload result is discarded and :ok is cached
      # regardless — confirm upload_from_filename raises on failure.
      Storage.upload_from_filename(local_path, "#{video.uuid}/index.mp3")
      :ok
    end)
  end

  # Transcribes the uploaded audio and inserts one subtitle row per chunk,
  # cached under "<slug>/transcription".
  defp transcribe_audio(video, slug) do
    Cache.fetch("#{slug}/transcription", fn ->
      transcription = ML.transcribe_video("#{video.url_root}/index.mp3")

      # All structs are built before the first insert, so a malformed chunk
      # raises before any row is written.
      :ok =
        transcription["chunks"]
        |> Enum.map(fn %{"text" => text, "timestamp" => [tstart, tend]} ->
          %Library.Subtitle{
            body: text,
            start: :erlang.float(tstart),
            # A chunk without an end timestamp falls back to the video duration.
            end: :erlang.float(tend || video.duration),
            video_id: video.id
          }
        end)
        |> Enum.each(&Repo.insert!/1)

      transcription
    end)
  end

  # Pairs every chunk with its embedding, inserts the segments and adds them
  # to the index. Returns the inserted segments.
  defp store_segments(index, chunks, embeddings) do
    # Enum.zip/2 stops at the shorter list, so surplus chunks or embeddings
    # are dropped silently.
    segments =
      chunks
      |> Enum.zip(embeddings)
      |> Enum.map(fn {chunk, embedding} ->
        %Library.Segment{chunk | embedding: embedding["embedding"]}
      end)
      |> Enum.map(&Repo.insert!/1)

    ML.add_embeddings(index, segments)

    segments
  end
end
|
||
|
```
|
||
|
|
||
|
```elixir
|
||
|
# {video id, mp3 filename} pairs to run through COSSgpt.transcribe_video/2.
videos = [
  {6333, "Supertokens Interview Highlights.mp3"},
  {6339, "Hanko full interview.mp3"},
  {6407, "OpenBB Full Interview.mp3"},
  {6422, "Signoz Interview Highlights.mp3"},
  {6390, "Remotion Interview Highlights.mp3"},
  {6305, "Nango Interview Highlights.mp3"},
  {6347, "Linen Interview Highlights.mp3"},
  {6429, "Medplum Full Interview.mp3"},
  {6393, "Windmill Interview Highlights.mp3"},
  {6419, "Elementary Data Highlights.mp3"},
  {6400, "Sematic_Highlights_final.mp3"},
  {6300, "Infisical Interview Highlights.mp3"},
  {6421, "Novu Interview Highlights.mp3"},
  {6330, "Hydra Full Interview.mp3"},
  {6309, "shuttle.rs Interview Highlights.mp3"},
  {6402, "Ivy Interview Highlights.mp3"},
  {6411, "Nextcloud Interview Highlights.mp3"},
  {6425, "Okteto Interview Highlights .mp3"},
  {6405, "Tigris Data Interview Highlights.mp3"},
  {6415, "Refine Interview Highlights.mp3"},
  {6413, "Cal.com Interview Highlights.mp3"},
  {6302, "Hoppscotch Interview Highlights.mp3"},
  {6334, "MAIN-Documenso.mp3"},
  {6331, "GiteaFull.mp3"},
  {6336, "maybefull.mp3"},
  {6391, "NuxtFull.mp3"},
  {6327, "OramaFull.mp3"},
  {6430, "FinalTimeplusFullInterview.mp3"},
  {6295, "tursofull.mp3"},
  {6324, "unkeyfull.mp3"},
  {8540, "Tauri-Full-Final.mp3"},
  {8541, "TailcallFull-correct.mp3"},
  {8539, "millionfull.mp3"},
  {6426, "Scalarfullinterview.mp3"},
  {6387, "IHP Interview Highlights.mp3"},
  {8196, "peer-1.mp3"},
  {8236, "peer-2.mp3"},
  {8412, "peer-3.mp3"},
  {8428, "peer-4.mp3"},
  {8231, "andreas-1.mp3"},
  {8411, "andreas-2.mp3"},
  {8426, "andreas-3.mp3"}
]

# Process the recordings one after another, in list order; the cell evaluates
# to the list of per-video results.
Enum.map(videos, fn {video_id, mp3} ->
  COSSgpt.transcribe_video(video_id, mp3)
end)
|
||
|
```
|
||
|
|
||
|
```elixir
|
||
|
# Ids to mark as deleted in the index, kept in their original order.
obsolete_segments = [
  937, 938, 939, 940, 941, 942, 932, 933, 934, 935, 936,
  1572, 1574, 1575, 1573,
  1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284,
  1289, 1294, 1299, 1285, 1290, 1295, 1300, 1286, 1291, 1296, 1301,
  1306, 1311, 1316, 1321, 1326, 1331, 1336, 1341, 1346,
  1287, 1292, 1297, 1288, 1293, 1298,
  1302, 1303, 1304, 1305, 1307, 1308, 1309, 1310, 1312, 1313, 1314,
  1320, 1319, 1324, 1329, 1334, 1339, 1344, 1349, 1354, 1359, 1364, 1369,
  1315, 1325, 1330, 1335, 1340, 1345, 1350, 1355, 1360, 1365, 1370, 1382,
  1317, 1322, 1327, 1332, 1337, 1342, 1347, 1352, 1357, 1362, 1367, 1372, 1377,
  1318, 1323, 1328, 1333, 1338, 1343, 1348, 1353, 1358, 1363, 1368, 1373,
  1351, 1356, 1361, 1366, 1371, 1376, 1381, 1386, 1391,
  1374, 1379, 1384, 1389,
  1375, 1380, 1385, 1390,
  1378, 1383, 1388, 1387
]

index = ML.load_index!()

# Mark every listed id as deleted; the `:ok =` match aborts the cell on the
# first id the index refuses.
Enum.each(obsolete_segments, fn segment_id ->
  :ok = HNSWLib.Index.mark_deleted(index, segment_id)
end)

# Save the index after all ids have been marked.
ML.save_index(index)
|
||
|
```
|