Rewrite transcript logic to be more generic (#4747)
The transcript logic in Invidious was written specifically as a workaround for captions, and not for transcripts as a feature. This PR genericises the logic so that it can be used to implement transcripts within Invidious. The most notable change is the added parsing of section headings, which were previously skipped over in favor of regular lines.
This commit is contained in:
commit
a56a724a55
|
@ -89,9 +89,14 @@ module Invidious::Routes::API::V1::Videos
|
||||||
|
|
||||||
if CONFIG.use_innertube_for_captions
|
if CONFIG.use_innertube_for_captions
|
||||||
params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
|
params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
|
||||||
initial_data = YoutubeAPI.get_transcript(params)
|
|
||||||
|
|
||||||
webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code)
|
transcript = Invidious::Videos::Transcript.from_raw(
|
||||||
|
YoutubeAPI.get_transcript(params),
|
||||||
|
caption.language_code,
|
||||||
|
caption.auto_generated
|
||||||
|
)
|
||||||
|
|
||||||
|
webvtt = transcript.to_vtt
|
||||||
else
|
else
|
||||||
# Timedtext API handling
|
# Timedtext API handling
|
||||||
url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
|
url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
|
||||||
|
|
|
@ -1,8 +1,26 @@
|
||||||
module Invidious::Videos
|
module Invidious::Videos
|
||||||
# Namespace for methods primarily relating to Transcripts
|
# A `Transcript` struct encapsulates a sequence of lines that together form the whole transcript for a given YouTube video.
|
||||||
module Transcript
|
# These lines can be categorized into two types: section headings and regular lines representing content from the video.
|
||||||
record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String
|
struct Transcript
|
||||||
|
# Types
|
||||||
|
record HeadingLine, start_ms : Time::Span, end_ms : Time::Span, line : String
|
||||||
|
record RegularLine, start_ms : Time::Span, end_ms : Time::Span, line : String
|
||||||
|
alias TranscriptLine = HeadingLine | RegularLine
|
||||||
|
|
||||||
|
property lines : Array(TranscriptLine)
|
||||||
|
|
||||||
|
property language_code : String
|
||||||
|
property auto_generated : Bool
|
||||||
|
|
||||||
|
# User friendly label for the current transcript.
|
||||||
|
# Example: "English (auto-generated)"
|
||||||
|
property label : String
|
||||||
|
|
||||||
|
# Initializes a new Transcript struct with the contents and associated metadata describing it
|
||||||
|
def initialize(@lines : Array(TranscriptLine), @language_code : String, @auto_generated : Bool, @label : String)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Generates a protobuf string to fetch the requested transcript from YouTube
|
||||||
def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
|
def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
|
||||||
kind = auto_generated ? "asr" : ""
|
kind = auto_generated ? "asr" : ""
|
||||||
|
|
||||||
|
@ -30,48 +48,79 @@ module Invidious::Videos
|
||||||
return params
|
return params
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String
|
# Constructs a Transcript struct from the initial YouTube response
|
||||||
# Convert into array of TranscriptLine
|
def self.from_raw(initial_data : Hash(String, JSON::Any), language_code : String, auto_generated : Bool)
|
||||||
lines = self.parse(initial_data)
|
transcript_panel = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
|
||||||
|
"content", "transcriptSearchPanelRenderer")
|
||||||
|
|
||||||
|
segment_list = transcript_panel.dig("body", "transcriptSegmentListRenderer")
|
||||||
|
|
||||||
|
if !segment_list["initialSegments"]?
|
||||||
|
raise NotFoundException.new("Requested transcript does not exist")
|
||||||
|
end
|
||||||
|
|
||||||
|
# Extract user-friendly label for the current transcript
|
||||||
|
|
||||||
|
footer_language_menu = transcript_panel.dig?(
|
||||||
|
"footer", "transcriptFooterRenderer", "languageMenu", "sortFilterSubMenuRenderer", "subMenuItems"
|
||||||
|
)
|
||||||
|
|
||||||
|
if footer_language_menu
|
||||||
|
label = footer_language_menu.as_a.select(&.["selected"].as_bool)[0]["title"].as_s
|
||||||
|
else
|
||||||
|
label = language_code
|
||||||
|
end
|
||||||
|
|
||||||
|
# Extract transcript lines
|
||||||
|
|
||||||
|
initial_segments = segment_list["initialSegments"].as_a
|
||||||
|
|
||||||
|
lines = [] of TranscriptLine
|
||||||
|
|
||||||
|
initial_segments.each do |line|
|
||||||
|
if unpacked_line = line["transcriptSectionHeaderRenderer"]?
|
||||||
|
line_type = HeadingLine
|
||||||
|
else
|
||||||
|
unpacked_line = line["transcriptSegmentRenderer"]
|
||||||
|
line_type = RegularLine
|
||||||
|
end
|
||||||
|
|
||||||
|
start_ms = unpacked_line["startMs"].as_s.to_i.millisecond
|
||||||
|
end_ms = unpacked_line["endMs"].as_s.to_i.millisecond
|
||||||
|
text = extract_text(unpacked_line["snippet"]) || ""
|
||||||
|
|
||||||
|
lines << line_type.new(start_ms, end_ms, text)
|
||||||
|
end
|
||||||
|
|
||||||
|
return Transcript.new(
|
||||||
|
lines: lines,
|
||||||
|
language_code: language_code,
|
||||||
|
auto_generated: auto_generated,
|
||||||
|
label: label
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Converts transcript lines to a WebVTT file
|
||||||
|
#
|
||||||
|
# This is used within Invidious to replace subtitles
|
||||||
|
# so as to work around YouTube's rate-limited timedtext endpoint.
|
||||||
|
def to_vtt
|
||||||
settings_field = {
|
settings_field = {
|
||||||
"Kind" => "captions",
|
"Kind" => "captions",
|
||||||
"Language" => target_language,
|
"Language" => @language_code,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt()
|
|
||||||
vtt = WebVTT.build(settings_field) do |vtt|
|
vtt = WebVTT.build(settings_field) do |vtt|
|
||||||
lines.each do |line|
|
@lines.each do |line|
|
||||||
|
# Section headers are excluded from the VTT conversion so as to
|
||||||
|
# match the regular captions returned from YouTube as much as possible
|
||||||
|
next if line.is_a? HeadingLine
|
||||||
|
|
||||||
vtt.cue(line.start_ms, line.end_ms, line.line)
|
vtt.cue(line.start_ms, line.end_ms, line.line)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
return vtt
|
return vtt
|
||||||
end
|
end
|
||||||
|
|
||||||
private def self.parse(initial_data : Hash(String, JSON::Any))
|
|
||||||
body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
|
|
||||||
"content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
|
|
||||||
"initialSegments").as_a
|
|
||||||
|
|
||||||
lines = [] of TranscriptLine
|
|
||||||
body.each do |line|
|
|
||||||
# Transcript section headers. They are not a part of the captions and as such we can safely skip them.
|
|
||||||
if line.as_h.has_key?("transcriptSectionHeaderRenderer")
|
|
||||||
next
|
|
||||||
end
|
|
||||||
|
|
||||||
line = line["transcriptSegmentRenderer"]
|
|
||||||
|
|
||||||
start_ms = line["startMs"].as_s.to_i.millisecond
|
|
||||||
end_ms = line["endMs"].as_s.to_i.millisecond
|
|
||||||
|
|
||||||
text = extract_text(line["snippet"]) || ""
|
|
||||||
|
|
||||||
lines << TranscriptLine.new(start_ms, end_ms, text)
|
|
||||||
end
|
|
||||||
|
|
||||||
return lines
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in a new issue