v0.6.22: Robust Linkkf extraction with zendriver fallback

2026-01-07 23:48:00 +09:00
parent 76be367a9e
commit d1866111c7
3 changed files with 135 additions and 66 deletions
@@ -81,7 +81,11 @@

 ## 📝 변경 이력 (Changelog)

-### v0.6.21 (2026-01-07)
+### v0.6.22 (2026-01-08)
+- **Linkkf 추출 로직 강화**: Cloudflare 보호가 강화된 Linkkf 도메인(flexora.xyz 등)에 대응하기 위해 브라우저 기반(Zendriver/Camoufox) 추출 엔진을 도입했습니다.
+- **오추출 방지**: 광고나 서비스 차단 페이지(Google Cloud 등)의 iframe을 비디오 URL로 오인하는 문제를 수정했습니다.
+
+## v0.6.21 (2026-01-07)
 - **Linkkf GDM 연동 수정**:
    - GDM 위임 전 실제 스트림 URL(m3u8) 추출 로직을 강제 호출하여 "Invalid data" 오류 해결.
    - Linkkf 설정의 다운로드 방식 및 쓰레드 수를 GDM에 전달하도록 개선.
@@ -1,5 +1,5 @@
 title: "애니 다운로더"
-version: "0.6.21"
+version: 0.6.22
 package_name: "anime_downloader"
 developer: "projectdx"
 description: "anime downloader"
@@ -41,6 +41,7 @@ import cloudscraper

 from anime_downloader.lib.ffmpeg_queue_v1 import FfmpegQueue, FfmpegQueueEntity
 from anime_downloader.lib.util import Util
+from .mod_ohli24 import LogicOhli24

 # 패키지
 # from .plugin import P
@@ -529,28 +530,79 @@ class LogicLinkkf(AnimeModuleBase):

    @staticmethod
    def get_html_cloudflare(url, cached=False, timeout=10):
-        """Cloudflare 보호 우회를 위한 HTTP 요청 (싱글톤 패턴)"""
-        user_agents_list = [
-            "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
-            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36",
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
-        ]
+        """Cloudflare 보호 우회를 위한 HTTP 요청 ( Zendriver Daemon -> Subprocess -> Camoufox -> Scraper 순)"""
+        start_time = time.time()
+        
+        # 0. Referer 설정
+        if LogicLinkkf.referer is None:
+            LogicLinkkf.referer = f"{P.ModelSetting.get('linkkf_url')}"
        
-        LogicLinkkf.headers["User-Agent"] = random.choice(user_agents_list)
        LogicLinkkf.headers["Referer"] = LogicLinkkf.referer or ""

-        # cloudscraper 싱글톤 패턴 - 매 요청마다 생성하지 않음
-        if LogicLinkkf._scraper is None:
-            LogicLinkkf._scraper = cloudscraper.create_scraper(
-                delay=10,
-                browser={"custom": "linkkf"},
-            )
+        # 1. Zendriver Daemon 시도 (최우선)
+        try:
+            if LogicOhli24.is_zendriver_daemon_running():
+                logger.info(f"[Linkkf] Trying Zendriver Daemon: {url}")
+                daemon_res = LogicOhli24.fetch_via_daemon(url, timeout=30)
+                if daemon_res.get("success") and daemon_res.get("html"):
+                    elapsed = time.time() - start_time
+                    logger.info(f"[Linkkf] Daemon success in {elapsed:.2f}s")
+                    return daemon_res["html"]
+        except Exception as e:
+            logger.warning(f"[Linkkf] Daemon error: {e}")

-        return LogicLinkkf._scraper.get(
-            url,
-            headers=LogicLinkkf.headers,
-            timeout=timeout,
-        ).content.decode("utf8", errors="replace")
+        # 2. Scraper 시도 (기본)
+        try:
+            if LogicLinkkf._scraper is None:
+                LogicLinkkf._scraper = cloudscraper.create_scraper(
+                    delay=10,
+                    browser={"custom": "linkkf"},
+                )
+            
+            user_agents_list = [
+                "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36",
+            ]
+            LogicLinkkf.headers["User-Agent"] = random.choice(user_agents_list)
+            
+            response = LogicLinkkf._scraper.get(url, headers=LogicLinkkf.headers, timeout=timeout)
+            
+            # 챌린지 페이지가 아닌 실제 콘텐츠가 포함되었는지 확인
+            content = response.text
+            if "Cloudflare" not in content or "video-player" in content or "iframe" in content:
+                return content
+            
+            logger.warning("[Linkkf] Scraper returned challenge page, falling back to browser...")
+        except Exception as e:
+            logger.warning(f"[Linkkf] Scraper error: {e}")
+
+        # 3. Zendriver Subprocess Fallback
+        try:
+            if LogicOhli24.ensure_zendriver_installed():
+                logger.info(f"[Linkkf] Trying Zendriver subprocess: {url}")
+                script_path = os.path.join(os.path.dirname(__file__), "lib", "zendriver_ohli24.py")
+                cmd = [sys.executable, script_path, url, str(30)]
+                result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+                if result.returncode == 0 and result.stdout.strip():
+                    zd_result = json.loads(result.stdout.strip())
+                    if zd_result.get("success") and zd_result.get("html"):
+                        return zd_result["html"]
+        except Exception as e:
+            logger.warning(f"[Linkkf] Zendriver fallback error: {e}")
+
+        # 4. Camoufox Fallback
+        try:
+            logger.info(f"[Linkkf] Trying Camoufox fallback: {url}")
+            script_path = os.path.join(os.path.dirname(__file__), "lib", "camoufox_ohli24.py")
+            result = subprocess.run([sys.executable, script_path, url, str(30)], capture_output=True, text=True, timeout=60)
+            if result.returncode == 0 and result.stdout.strip():
+                cf_result = json.loads(result.stdout.strip())
+                if cf_result.get("success") and cf_result.get("html"):
+                    return cf_result["html"]
+        except Exception as e:
+            logger.warning(f"[Linkkf] Camoufox fallback error: {e}")
+
+        return ""

    @staticmethod
    def add_whitelist(*args):
@@ -632,81 +684,94 @@ class LogicLinkkf(AnimeModuleBase):
        try:
            logger.info(f"Extracting video URL from: {playid_url}")
            
-            # Step 1: playid 페이지에서 iframe src 추출
-            headers = {
-                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
-                "Referer": "https://linkkf.live/"
-            }
-            
-            response = requests.get(playid_url, headers=headers, timeout=15)
-            html_content = response.text
+            # Step 1: playid 페이지에서 iframe src 추출 (cloudscraper 사용)
+            html_content = LogicLinkkf.get_html(playid_url)
+            if not html_content:
+                logger.error(f"Failed to fetch playid page: {playid_url}")
+                return None, None, None
                
            soup = BeautifulSoup(html_content, "html.parser")
            
-            # iframe 찾기 (id="video-player-iframe" 또는 play.sub3.top 포함)
+            # iframe 찾기 (광고 iframe 제외를 위해 id나 src 패턴 강조)
            iframe = soup.select_one("iframe#video-player-iframe")
            if not iframe:
                iframe = soup.select_one("iframe[src*='play.sub']")
            if not iframe:
-                iframe = soup.select_one("iframe")
+                iframe = soup.select_one("iframe[src*='play.php']")
+            
+            # fallback if strictly needed but skip ad domains
+            if not iframe:
+                all_iframes = soup.select("iframe")
+                for f in all_iframes:
+                    src = f.get("src", "")
+                    if any(x in src for x in ["googletag", "googlead", "adsystem", "cloud.google"]): 
+                        continue
+                    if src.startswith("http"):
+                        iframe = f
+                        break
            
            if iframe and iframe.get("src"):
                iframe_src = iframe.get("src")
-                logger.info(f"Found iframe: {iframe_src}")
+                # HTML entity decoding (&#038; -> &)
+                if "&#038;" in iframe_src:
+                    iframe_src = iframe_src.replace("&#038;", "&")
+                
+                logger.info(f"Found player iframe: {iframe_src}")
                
                # Step 2: iframe 페이지에서 m3u8 URL과 vtt URL 추출
-                iframe_headers = {
-                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
-                    "Referer": playid_url
-                }
+                iframe_content = LogicLinkkf.get_html(iframe_src)
+                if not iframe_content:
+                    logger.error(f"Failed to fetch iframe content: {iframe_src}")
+                    return None, iframe_src, None
                
-                iframe_response = requests.get(iframe_src, headers=iframe_headers, timeout=15)
-                iframe_content = iframe_response.text
-                
-                # m3u8 URL 패턴 찾기
-                # 예: url: 'https://n8.hlz3.top/403116s11/index.m3u8'
-                m3u8_pattern = re.compile(r"url:\s*['\"]([^'\"]*\.m3u8)['\"]")
+                # m3u8 URL 패턴 찾기 (더 정밀하게)
+                # 패턴 1: url: 'https://...m3u8'
+                m3u8_pattern = re.compile(r"url:\s*['\"]([^'\"]*\.m3u8[^'\"]*)['\"]")
                m3u8_match = m3u8_pattern.search(iframe_content)
                
+                # 패턴 2: <source src="https://...m3u8">
+                if not m3u8_match:
+                    source_pattern = re.compile(r"<source[^>]+src=['\"]([^'\"]*\.m3u8[^'\"]*)['\"]")
+                    m3u8_match = source_pattern.search(iframe_content)
+                
+                # 패턴 3: var src = '...m3u8'
+                if not m3u8_match:
+                    src_pattern = re.compile(r"src\s*=\s*['\"]([^'\"]*\.m3u8[^'\"]*)['\"]")
+                    m3u8_match = src_pattern.search(iframe_content)
+
                if m3u8_match:
                    video_url = m3u8_match.group(1)
                    # 상대 경로 처리 (예: cache/...)
-                    if video_url.startswith('cache/'):
+                    if video_url.startswith('cache/') or video_url.startswith('/cache/'):
                        from urllib.parse import urljoin
                        video_url = urljoin(iframe_src, video_url)
-                    logger.info(f"Found m3u8 URL: {video_url}")
+                    logger.info(f"Extracted m3u8 URL: {video_url}")
                else:
-                    # 대안 패턴: source src
-                    source_pattern = re.compile(r"<source[^>]+src=['\"]([^'\"]+)['\"]")
-                    source_match = source_pattern.search(iframe_content)
-                    if source_match:
-                        video_url = source_match.group(1)
-                        if video_url.startswith('cache/'):
-                            from urllib.parse import urljoin
-                            video_url = urljoin(iframe_src, video_url)
-                        logger.info(f"Found source URL: {video_url}")
+                    logger.warning(f"m3u8 URL not found in iframe. Content snippet: {iframe_content[:200]}...")
                
                # VTT 자막 URL 추출
-                # 예: <track src="https://...vtt" kind="subtitles">
-                vtt_pattern = re.compile(r"<track[^>]+src=['\"]([^'\"]*\.vtt)['\"]")
+                vtt_pattern = re.compile(r"['\"]src['\"]?:\s*['\"]([^'\"]*\.vtt)['\"]")
                vtt_match = vtt_pattern.search(iframe_content)
+                if not vtt_match:
+                    vtt_pattern2 = re.compile(r"url:\s*['\"]([^'\"]*\.vtt)['\"]")
+                    vtt_match = vtt_pattern2.search(iframe_content)
+                if not vtt_match:
+                    vtt_pattern3 = re.compile(r"<track[^>]+src=['\"]([^'\"]*\.vtt)['\"]")
+                    vtt_match = vtt_pattern3.search(iframe_content)
+                
                if vtt_match:
                    vtt_url = vtt_match.group(1)
-                    logger.info(f"Found VTT subtitle URL: {vtt_url}")
-                else:
-                    # 대안 패턴: url: '...vtt'
-                    vtt_pattern2 = re.compile(r"url:\s*['\"]([^'\"]*\.vtt)['\"]")
-                    vtt_match2 = vtt_pattern2.search(iframe_content)
-                    if vtt_match2:
-                        vtt_url = vtt_match2.group(1)
-                        logger.info(f"Found VTT subtitle URL (alt pattern): {vtt_url}")
+                    if vtt_url.startswith('/'):
+                        from urllib.parse import urljoin
+                        vtt_url = urljoin(iframe_src, vtt_url)
+                    logger.info(f"Extracted VTT URL: {vtt_url}")
                
                referer_url = iframe_src
            else:
-                logger.warning("No iframe found in playid page")
+                logger.warning(f"No player iframe found in playid page. HTML snippet: {html_content[:200]}...")
                
        except Exception as e:
-            logger.error(f"Error extracting video URL: {e}")
+            logger.error(f"Error in extract_video_url_from_playid: {e}")
            logger.error(traceback.format_exc())
        
        return video_url, referer_url, vtt_url