fix(backend): Fix Youtube blocking our cloud ips (#11456)

YouTube can block cloud IPs, causing the YouTube transcribe block to
not work. This PR adds a Webshare proxy to get around this issue.

### Changes 🏗️

- Add Webshare proxy to the YouTube transcribe block

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  <!-- Put your test plan here: -->
  - [x] I have tested this works locally using the proxy

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> Routes YouTube transcript fetching through Webshare proxy using
user/password credentials, wiring in provider enum, settings, default
credentials, and updated tests.
> 
> - **Blocks** (`backend/blocks/youtube.py`):
> - Use `WebshareProxyConfig` with `YouTubeTranscriptApi` to fetch
transcripts via proxy.
> - Add `credentials` input (`user_password` for `webshare_proxy`);
include test credentials and mocks.
> - Update method signatures: `get_transcript(video_id, credentials)`
and `run(..., *, credentials, ...)`.
>   - Change description to indicate proxy usage; add logging.
> - **Integrations**:
> - Providers (`backend/integrations/providers.py`): add
`ProviderName.WEBSHARE_PROXY`.
> - Credentials store (`backend/integrations/credentials_store.py`): add
`webshare_proxy` `UserPasswordCredentials`; include in
`DEFAULT_CREDENTIALS` and conditionally in `get_all_creds`.
> - **Settings** (`backend/util/settings.py`): add secrets
`webshare_proxy_username` and `webshare_proxy_password`.
> - **Tests** (`test/blocks/test_youtube.py`): update to pass
credentials and assert proxy config; add custom-credentials test; adjust
fallback/priority tests.
> 
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
d060898488. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->

Co-authored-by: Nicholas Tindle <nicholas.tindle@agpt.co>
This commit is contained in:
Swifty
2025-12-01 20:54:52 +01:00
committed by GitHub
parent 0728f3bd49
commit 7d53c0de27
5 changed files with 136 additions and 14 deletions

View File

@@ -1,9 +1,13 @@
import logging
from typing import Literal
from urllib.parse import parse_qs, urlparse
from pydantic import SecretStr
from youtube_transcript_api._api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound
from youtube_transcript_api._transcripts import FetchedTranscript
from youtube_transcript_api.formatters import TextFormatter
from youtube_transcript_api.proxies import WebshareProxyConfig
from backend.data.block import (
Block,
@@ -12,7 +16,42 @@ from backend.data.block import (
BlockSchemaInput,
BlockSchemaOutput,
)
from backend.data.model import SchemaField
from backend.data.model import (
CredentialsField,
CredentialsMetaInput,
SchemaField,
UserPasswordCredentials,
)
from backend.integrations.providers import ProviderName
logger = logging.getLogger(__name__)
# Mock Webshare proxy credentials with placeholder username/password values.
# Passed to the block as `test_credentials`.
TEST_CREDENTIALS = UserPasswordCredentials(
id="01234567-89ab-cdef-0123-456789abcdef",
provider="webshare_proxy",
username=SecretStr("mock-webshare-username"),
password=SecretStr("mock-webshare-password"),
title="Mock Webshare Proxy credentials",
)
# Metadata-only view of TEST_CREDENTIALS (provider, id, type, title), without
# the username/password. Used as the `credentials` entry of the block's
# `test_input`.
TEST_CREDENTIALS_INPUT = {
"provider": TEST_CREDENTIALS.provider,
"id": TEST_CREDENTIALS.id,
"type": TEST_CREDENTIALS.type,
"title": TEST_CREDENTIALS.title,
}
# Webshare proxy credentials are plain username/password credentials.
WebshareProxyCredentials = UserPasswordCredentials
# Credentials-input type parameterized with the `webshare_proxy` provider and
# the "user_password" credential type.
WebshareProxyCredentialsInput = CredentialsMetaInput[
Literal[ProviderName.WEBSHARE_PROXY],
Literal["user_password"],
]
def WebshareProxyCredentialsField() -> WebshareProxyCredentialsInput:
    """Build the credentials input field for the Webshare proxy.

    Returns whatever ``CredentialsField`` produces when given the
    description attached to this input.
    """
    field_description = "Webshare proxy credentials for fetching YouTube transcripts"
    return CredentialsField(description=field_description)
class TranscribeYoutubeVideoBlock(Block):
@@ -22,6 +61,7 @@ class TranscribeYoutubeVideoBlock(Block):
description="The URL of the YouTube video to transcribe",
placeholder="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
)
credentials: WebshareProxyCredentialsInput = WebshareProxyCredentialsField()
class Output(BlockSchemaOutput):
video_id: str = SchemaField(description="The extracted YouTube video ID")
@@ -35,9 +75,12 @@ class TranscribeYoutubeVideoBlock(Block):
id="f3a8f7e1-4b1d-4e5f-9f2a-7c3d5a2e6b4c",
input_schema=TranscribeYoutubeVideoBlock.Input,
output_schema=TranscribeYoutubeVideoBlock.Output,
description="Transcribes a YouTube video.",
description="Transcribes a YouTube video using a proxy.",
categories={BlockCategory.SOCIAL},
test_input={"youtube_url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"},
test_input={
"youtube_url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
"credentials": TEST_CREDENTIALS_INPUT,
},
test_output=[
("video_id", "dQw4w9WgXcQ"),
(
@@ -45,8 +88,9 @@ class TranscribeYoutubeVideoBlock(Block):
"Never gonna give you up\nNever gonna let you down",
),
],
test_credentials=TEST_CREDENTIALS,
test_mock={
"get_transcript": lambda video_id: [
"get_transcript": lambda video_id, credentials: [
{"text": "Never gonna give you up"},
{"text": "Never gonna let you down"},
],
@@ -69,16 +113,27 @@ class TranscribeYoutubeVideoBlock(Block):
return parsed_url.path.split("/")[2]
raise ValueError(f"Invalid YouTube URL: {url}")
@staticmethod
def get_transcript(video_id: str) -> FetchedTranscript:
def get_transcript(
self, video_id: str, credentials: WebshareProxyCredentials
) -> FetchedTranscript:
"""
Get transcript for a video, preferring English but falling back to any available language.
:param video_id: The YouTube video ID
:param credentials: The Webshare proxy credentials
:return: The fetched transcript
:raises: Any exception except NoTranscriptFound for requested languages
"""
api = YouTubeTranscriptApi()
logger.warning(
"Using Webshare proxy for YouTube transcript fetch (video_id=%s)",
video_id,
)
proxy_config = WebshareProxyConfig(
proxy_username=credentials.username.get_secret_value(),
proxy_password=credentials.password.get_secret_value(),
)
api = YouTubeTranscriptApi(proxy_config=proxy_config)
try:
# Try to get English transcript first (default behavior)
return api.fetch(video_id=video_id)
@@ -101,11 +156,17 @@ class TranscribeYoutubeVideoBlock(Block):
transcript_text = formatter.format_transcript(transcript)
return transcript_text
async def run(self, input_data: Input, **kwargs) -> BlockOutput:
async def run(
self,
input_data: Input,
*,
credentials: WebshareProxyCredentials,
**kwargs,
) -> BlockOutput:
video_id = self.extract_video_id(input_data.youtube_url)
yield "video_id", video_id
transcript = self.get_transcript(video_id)
transcript = self.get_transcript(video_id, credentials)
transcript_text = self.format_transcript(transcript=transcript)
yield "transcript", transcript_text

View File

@@ -15,6 +15,7 @@ from backend.data.model import (
OAuth2Credentials,
OAuthState,
UserIntegrations,
UserPasswordCredentials,
)
from backend.data.redis_client import get_redis_async
from backend.util.settings import Settings
@@ -207,6 +208,14 @@ v0_credentials = APIKeyCredentials(
expires_at=None,
)
# Webshare proxy credentials whose username and password are read from the
# `webshare_proxy_username` / `webshare_proxy_password` secrets in settings.
webshare_proxy_credentials = UserPasswordCredentials(
id="a5b3c7d9-2e4f-4a6b-8c1d-9e0f1a2b3c4d",
provider="webshare_proxy",
username=SecretStr(settings.secrets.webshare_proxy_username),
password=SecretStr(settings.secrets.webshare_proxy_password),
title="Use Credits for Webshare Proxy",
)
DEFAULT_CREDENTIALS = [
ollama_credentials,
revid_credentials,
@@ -233,6 +242,7 @@ DEFAULT_CREDENTIALS = [
google_maps_credentials,
llama_api_credentials,
v0_credentials,
webshare_proxy_credentials,
]
@@ -321,6 +331,11 @@ class IntegrationCredentialsStore:
all_credentials.append(zerobounce_credentials)
if settings.secrets.google_maps_api_key:
all_credentials.append(google_maps_credentials)
if (
settings.secrets.webshare_proxy_username
and settings.secrets.webshare_proxy_password
):
all_credentials.append(webshare_proxy_credentials)
return all_credentials
async def get_creds_by_id(

View File

@@ -49,6 +49,7 @@ class ProviderName(str, Enum):
TODOIST = "todoist"
UNREAL_SPEECH = "unreal_speech"
V0 = "v0"
WEBSHARE_PROXY = "webshare_proxy"
ZEROBOUNCE = "zerobounce"
@classmethod

View File

@@ -571,6 +571,12 @@ class Secrets(UpdateTrackingModel["Secrets"], BaseSettings):
open_router_api_key: str = Field(default="", description="Open Router API Key")
llama_api_key: str = Field(default="", description="Llama API Key")
v0_api_key: str = Field(default="", description="v0 by Vercel API key")
# Username/password pair for the Webshare proxy; both default to "".
webshare_proxy_username: str = Field(
default="", description="Webshare Proxy Username"
)
webshare_proxy_password: str = Field(
default="", description="Webshare Proxy Password"
)
reddit_client_id: str = Field(default="", description="Reddit client ID")
reddit_client_secret: str = Field(default="", description="Reddit client secret")

View File

@@ -1,10 +1,14 @@
from unittest.mock import Mock, patch
import pytest
from pydantic import SecretStr
from youtube_transcript_api._errors import NoTranscriptFound
from youtube_transcript_api._transcripts import FetchedTranscript, Transcript
from youtube_transcript_api.proxies import WebshareProxyConfig
from backend.blocks.youtube import TranscribeYoutubeVideoBlock
from backend.blocks.youtube import TEST_CREDENTIALS, TranscribeYoutubeVideoBlock
from backend.data.model import UserPasswordCredentials
from backend.integrations.providers import ProviderName
class TestTranscribeYoutubeVideoBlock:
@@ -13,6 +17,7 @@ class TestTranscribeYoutubeVideoBlock:
def setup_method(self):
"""Set up test fixtures."""
self.youtube_block = TranscribeYoutubeVideoBlock()
self.credentials = TEST_CREDENTIALS
def test_extract_video_id_standard_url(self):
"""Test extracting video ID from standard YouTube URL."""
@@ -42,10 +47,41 @@ class TestTranscribeYoutubeVideoBlock:
mock_api.fetch.return_value = mock_transcript
# Execute
result = TranscribeYoutubeVideoBlock.get_transcript("test_video_id")
result = self.youtube_block.get_transcript("test_video_id", self.credentials)
# Assert
assert result == mock_transcript
mock_api_class.assert_called_once()
proxy_config = mock_api_class.call_args[1]["proxy_config"]
assert isinstance(proxy_config, WebshareProxyConfig)
mock_api.fetch.assert_called_once_with(video_id="test_video_id")
mock_api.list.assert_not_called()
@patch("backend.blocks.youtube.YouTubeTranscriptApi")
def test_get_transcript_with_custom_credentials(self, mock_api_class):
    """Caller-supplied proxy credentials must end up in the proxy config."""
    # Arrange: the patched API class hands back an instance whose fetch()
    # succeeds on the first attempt.
    expected_transcript = Mock(spec=FetchedTranscript)
    api_instance = Mock()
    api_instance.fetch.return_value = expected_transcript
    mock_api_class.return_value = api_instance

    custom_credentials = UserPasswordCredentials(
        provider=ProviderName.WEBSHARE_PROXY,
        username=SecretStr("custom_user"),
        password=SecretStr("custom_pass"),
    )

    # Act
    actual = self.youtube_block.get_transcript("test_video_id", custom_credentials)

    # Assert: the transcript is returned unchanged and the API was built
    # exactly once with a Webshare proxy config carrying the custom values.
    assert actual == expected_transcript
    mock_api_class.assert_called_once()
    passed_config = mock_api_class.call_args.kwargs["proxy_config"]
    assert isinstance(passed_config, WebshareProxyConfig)
    assert passed_config.proxy_username == "custom_user"
    assert passed_config.proxy_password == "custom_pass"

    # The direct fetch succeeded, so the listing fallback is never touched.
    api_instance.fetch.assert_called_once_with(video_id="test_video_id")
    api_instance.list.assert_not_called()
@@ -74,10 +110,11 @@ class TestTranscribeYoutubeVideoBlock:
mock_api.list.return_value = mock_transcript_list
# Execute
result = TranscribeYoutubeVideoBlock.get_transcript("test_video_id")
result = self.youtube_block.get_transcript("test_video_id", self.credentials)
# Assert
assert result == mock_fetched_transcript
mock_api_class.assert_called_once()
mock_api.fetch.assert_called_once_with(video_id="test_video_id")
mock_api.list.assert_called_once_with("test_video_id")
mock_transcript_hu.fetch.assert_called_once()
@@ -109,10 +146,11 @@ class TestTranscribeYoutubeVideoBlock:
mock_api.list.return_value = mock_transcript_list
# Execute
result = TranscribeYoutubeVideoBlock.get_transcript("test_video_id")
result = self.youtube_block.get_transcript("test_video_id", self.credentials)
# Assert - should use manually created transcript first
assert result == mock_fetched_manual
mock_api_class.assert_called_once()
mock_transcript_manual.fetch.assert_called_once()
mock_transcript_generated.fetch.assert_not_called()
@@ -137,4 +175,5 @@ class TestTranscribeYoutubeVideoBlock:
# Execute and assert exception is raised
with pytest.raises(NoTranscriptFound):
TranscribeYoutubeVideoBlock.get_transcript("test_video_id")
self.youtube_block.get_transcript("test_video_id", self.credentials)
mock_api_class.assert_called_once()