@@ -7,6 +7,7 @@ defmodule CF.Videos.CaptionsFetcherYoutube do
7
7
@ behaviour CF.Videos.CaptionsFetcher
8
8
9
9
require Logger
10
+ import SweetXml
10
11
11
12
@ impl true
12
13
def fetch ( % { youtube_id: youtube_id , language: language } ) do
@@ -69,31 +70,29 @@ defmodule CF.Videos.CaptionsFetcherYoutube do
69
70
70
71
# Turns the raw transcript XML into a list of caption maps.
#
# Every `//transcript/text` node yields one map with the keys:
#   * `:text`     - the node's text content, passed through `clean_text/1`
#   * `:start`    - the `start` attribute, passed through `parse_float/1`
#   * `:duration` - the `dur` attribute, passed through `parse_float/1`
#
# Entries with a `nil` start, or with a `nil` or empty text, are dropped.
# `:duration` is not checked here, so it may be `nil` in the result.
defp process_transcript(transcript) do
  caption_fields = [
    text: transform_by(~x"./text()"s, &clean_text/1),
    start: transform_by(~x"./@start"s, &parse_float/1),
    duration: transform_by(~x"./@dur"s, &parse_float/1)
  ]

  captions = SweetXml.xpath(transcript, ~x"//transcript/text"l, caption_fields)

  Enum.reject(captions, fn %{text: text, start: start} ->
    start == nil or text == nil or text == ""
  end)
end
78
83
79
# Normalises the text content of a single caption node.
#
# Takes the node text as a binary and returns it with HTML entities decoded
# and surrounding whitespace trimmed.
defp clean_text(text) do
  text
  # Unescape `&amp;` first: a doubly-escaped entity such as `&amp;#39;` then
  # becomes `&#39;`, which the single `HtmlEntities.decode/1` pass below can
  # resolve. Replacing "&" with "&" (as previously written) was a no-op.
  |> String.replace("&amp;", "&")
  |> HtmlEntities.decode()
  |> String.trim()
end
93
90
94
91
# Leniently converts a binary to a float.
#
# Returns the float parsed from the start of `val` (any trailing characters
# are ignored), or `nil` when `val` does not begin with a number.
defp parse_float(val) do
  case Float.parse(val) do
    :error -> nil
    {number, _rest} -> number
  end
end
98
97
99
98
# Below is an implementation using the official YouTube API, but it requires OAuth2 authentication.
0 commit comments