Skip to content

Commit 64de883

Browse files
committed
fix(Captions): use proper XML parsing
1 parent c7efee0 commit 64de883

File tree

2 files changed: 20 additions and 20 deletions.

apps/cf/lib/videos/captions_fetcher_youtube.ex

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ defmodule CF.Videos.CaptionsFetcherYoutube do
77
@behaviour CF.Videos.CaptionsFetcher
88

99
require Logger
10+
import SweetXml
1011

1112
@impl true
1213
def fetch(%{youtube_id: youtube_id, language: language}) do
@@ -69,31 +70,29 @@ defmodule CF.Videos.CaptionsFetcherYoutube do
6970

7071
defp process_transcript(transcript) do
7172
transcript
72-
|> String.replace(~r/^<\?xml version="1.0" encoding="utf-8"\?><transcript>/, "")
73-
|> String.replace("</transcript>", "")
74-
|> String.split("</text>")
75-
|> Enum.filter(&(String.trim(&1) != ""))
76-
|> Enum.map(&process_line/1)
73+
|> SweetXml.xpath(
74+
~x"//transcript/text"l,
75+
text: ~x"./text()"s |> transform_by(&clean_text/1),
76+
start: ~x"./@start"s |> transform_by(&parse_float/1),
77+
duration: ~x"./@dur"s |> transform_by(&parse_float/1)
78+
)
79+
|> Enum.filter(fn %{text: text, start: start} ->
80+
start != nil and text != nil and text != ""
81+
end)
7782
end
7883

79-
defp process_line(line) do
80-
%{"start" => start} = Regex.named_captures(~r/start="(?<start>[\d.]+)"/, line)
81-
%{"dur" => dur} = Regex.named_captures(~r/dur="(?<dur>[\d.]+)"/, line)
82-
83-
text =
84-
line
85-
|> String.replace("&amp;", "&")
86-
|> String.replace(~r/<text.+>/, "")
87-
|> String.replace(~r"</?[^>]+(>|$)", "")
88-
|> HtmlEntities.decode()
89-
|> String.trim()
90-
91-
%{start: parse_float(start), duration: parse_float(dur), text: text}
84+
defp clean_text(text) do
85+
text
86+
|> String.replace("&amp;", "&")
87+
|> HtmlEntities.decode()
88+
|> String.trim()
9289
end
9390

9491
defp parse_float(val) do
95-
{num, _} = Float.parse(val)
96-
num
92+
case Float.parse(val) do
93+
{num, _} -> num
94+
_ -> nil
95+
end
9796
end
9897

9998
# Below is an implementation using the official YouTube API, but it requires OAuth2 authentication.

apps/cf/mix.exs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ defmodule CF.Mixfile do
6060
{:yaml_elixir, "~> 2.9.0"},
6161
{:jason, "~> 1.4"},
6262
{:openai, "~> 0.6.1"},
63+
{:sweet_xml, "~> 0.7.4"},
6364

6465
# ---- Internal ----
6566
{:db, in_umbrella: true},

0 commit comments

Comments
 (0)