diff --git a/internal/tools/youtube/timestamp_test.go b/internal/tools/youtube/timestamp_test.go new file mode 100644 index 00000000..34f23c6f --- /dev/null +++ b/internal/tools/youtube/timestamp_test.go @@ -0,0 +1,60 @@ +package youtube + +import ( + "testing" +) + +func TestParseTimestampToSeconds(t *testing.T) { + tests := []struct { + timestamp string + expected int + shouldErr bool + }{ + {"00:30", 30, false}, + {"01:30", 90, false}, + {"01:05:30", 3930, false}, // 1 hour 5 minutes 30 seconds + {"10:00", 600, false}, + {"invalid", 0, true}, + {"1:2:3:4", 0, true}, // too many parts + } + + for _, test := range tests { + result, err := parseTimestampToSeconds(test.timestamp) + + if test.shouldErr { + if err == nil { + t.Errorf("Expected error for timestamp %s, but got none", test.timestamp) + } + } else { + if err != nil { + t.Errorf("Unexpected error for timestamp %s: %v", test.timestamp, err) + } + if result != test.expected { + t.Errorf("For timestamp %s, expected %d seconds, got %d", test.timestamp, test.expected, result) + } + } + } +} + +func TestShouldIncludeRepeat(t *testing.T) { + tests := []struct { + lastTimestamp string + currentTimestamp string + expected bool + description string + }{ + {"00:30", "01:30", true, "60 second gap should allow repeat"}, + {"00:30", "00:45", false, "15 second gap should not allow repeat"}, + {"01:00", "01:30", true, "30 second gap should allow repeat (boundary case)"}, + {"01:00", "01:29", false, "29 second gap should not allow repeat"}, + {"invalid", "01:30", true, "invalid timestamp should err on side of inclusion"}, + {"01:30", "invalid", true, "invalid timestamp should err on side of inclusion"}, + } + + for _, test := range tests { + result := shouldIncludeRepeat(test.lastTimestamp, test.currentTimestamp) + if result != test.expected { + t.Errorf("%s: expected %v, got %v", test.description, test.expected, result) + } + } +} diff --git a/internal/tools/youtube/youtube.go b/internal/tools/youtube/youtube.go index 
// timestampRegex matches a whole (already trimmed) line that is either
//   - digits only, e.g. a cue sequence number such as "42", or
//   - one or two digits, a colon and two digits, optionally followed by a
//     second ":dd" group and/or a ".ddd" millisecond suffix, e.g. "01:30",
//     "01:05:30" or "00:00:01.234".
//
// The dot before the milliseconds is escaped with a single backslash because
// the pattern is a raw string literal; a doubled backslash there would match a
// literal backslash followed by any character instead of a literal dot.
var timestampRegex = regexp.MustCompile(`^\d+$|^\d{1,2}:\d{2}(:\d{2})?(\.\d{3})?$`)

// TimeGapForRepeats is the minimum gap, in seconds, between two occurrences of
// the same caption text for shouldIncludeRepeat to treat the later occurrence
// as a legitimate repeat rather than a duplicate.
const TimeGapForRepeats = 10
func removeVTTTags(s string) string { return tagRegex.ReplaceAllString(s, "") } +// shouldIncludeRepeat determines if repeated content should be included based on time gap +func shouldIncludeRepeat(lastTimestamp, currentTimestamp string) bool { + // Parse timestamps to calculate time difference + lastSeconds, err1 := parseTimestampToSeconds(lastTimestamp) + currentSeconds, err2 := parseTimestampToSeconds(currentTimestamp) + + if err1 != nil || err2 != nil { + // If we can't parse timestamps, err on the side of inclusion + return true + } + + // Allow repeats if there's at least a TimeGapForRepeats gap + // This threshold can be adjusted based on use case: + // - 10 seconds works well for most content + // - Could be made configurable in the future + timeDiffSeconds := currentSeconds - lastSeconds + return timeDiffSeconds >= TimeGapForRepeats +} + +// parseTimestampToSeconds converts timestamp string (HH:MM:SS or MM:SS) to total seconds +func parseTimestampToSeconds(timestamp string) (int, error) { + parts := strings.Split(timestamp, ":") + if len(parts) < 2 || len(parts) > 3 { + return 0, fmt.Errorf("invalid timestamp format: %s", timestamp) + } + + var hours, minutes, seconds int + var err error + + if len(parts) == 3 { + // HH:MM:SS format + if hours, err = strconv.Atoi(parts[0]); err != nil { + return 0, err + } + if minutes, err = strconv.Atoi(parts[1]); err != nil { + return 0, err + } + if seconds, err = strconv.Atoi(parts[2]); err != nil { + return 0, err + } + } else { + // MM:SS format + if minutes, err = strconv.Atoi(parts[0]); err != nil { + return 0, err + } + if seconds, err = strconv.Atoi(parts[1]); err != nil { + return 0, err + } + } + + return hours*3600 + minutes*60 + seconds, nil +} + func (o *YouTube) GrabComments(videoId string) (ret []string, err error) { if err = o.initService(); err != nil { return