Browse Source

make infiniscroll plugin also expand details and comments sections

Nick Sweeting 1 month ago
parent
commit
8c69124935

+ 5 - 0
archivebox/plugins/infiniscroll/config.json

@@ -41,6 +41,11 @@
       "default": 16000,
       "minimum": 1000,
       "description": "Minimum page height to scroll to in pixels"
+    },
+    "INFINISCROLL_EXPAND_DETAILS": {
+      "type": "boolean",
+      "default": true,
+      "description": "Expand <details> elements and click 'load more' buttons for comments"
     }
   }
 }

+ 154 - 1
archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js

@@ -6,6 +6,8 @@
  * ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached.
  * Stops early if no new content loads after a scroll.
  *
+ * Optionally expands <details> elements and clicks "load more" buttons.
+ *
  * Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>
  * Output: JSONL with scroll stats (no files created)
  *
@@ -16,6 +18,7 @@
  *     INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600)
  *     INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10)
  *     INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000)
+ *     INFINISCROLL_EXPAND_DETAILS: Expand <details> and comments (default: true)
  */
 
 function getEnv(name, defaultValue = '') {
@@ -91,6 +94,130 @@ function sleep(ms) {
     return new Promise(resolve => setTimeout(resolve, ms));
 }
 
+/**
+ * Expand <details> elements and click "load more" buttons for comments.
+ * Based on archivebox.ts expandComments function.
+ */
+async function expandDetails(page, options = {}) {
+    const {
+        timeout = 30000,
+        limit = 500,
+        delay = 500,
+    } = options;
+
+    const startTime = Date.now();
+
+    // First, expand all <details> elements
+    const detailsExpanded = await page.evaluate(() => {
+        let count = 0;
+        // Generic <details> elements
+        document.querySelectorAll('details:not([open])').forEach(el => {
+            el.open = true;
+            count++;
+        });
+        // Github README details sections
+        document.querySelectorAll('article details:not([open])').forEach(el => {
+            el.open = true;
+            count++;
+        });
+        // Github issue discussion hidden comments
+        document.querySelectorAll('div.js-discussion details:not(.details-overlay):not([open])').forEach(el => {
+            el.open = true;
+            count++;
+        });
+        // HedgeDoc/Markdown details sections
+        document.querySelectorAll('.markdown-body details:not([open])').forEach(el => {
+            el.open = true;
+            count++;
+        });
+        return count;
+    });
+
+    if (detailsExpanded > 0) {
+        console.error(`Expanded ${detailsExpanded} <details> elements`);
+    }
+
+    // Then click "load more" buttons for comments
+    const numExpanded = await page.evaluate(async ({ timeout, limit, delay }) => {
+        // Helper to find elements by XPath
+        function getElementsByXPath(xpath) {
+            const results = [];
+            const xpathResult = document.evaluate(
+                xpath,
+                document,
+                null,
+                XPathResult.ORDERED_NODE_ITERATOR_TYPE,
+                null
+            );
+            let node;
+            while ((node = xpathResult.iterateNext()) != null) {
+                results.push(node);
+            }
+            return results;
+        }
+
+        const wait = (ms) => new Promise(res => setTimeout(res, ms));
+
+        // Find all "load more" type buttons/links
+        const getLoadMoreLinks = () => [
+            // Reddit (new)
+            ...document.querySelectorAll('faceplate-partial[loading=action]'),
+            // Reddit (old) - show more replies
+            ...document.querySelectorAll('a[onclick^="return morechildren"]'),
+            // Reddit (old) - show hidden replies
+            ...document.querySelectorAll('a[onclick^="return togglecomment"]'),
+            // Twitter/X - show more replies
+            ...getElementsByXPath("//*[text()='Show more replies']"),
+            ...getElementsByXPath("//*[text()='Show replies']"),
+            // Generic "load more" / "show more" buttons
+            ...getElementsByXPath("//*[contains(text(),'Load more')]"),
+            ...getElementsByXPath("//*[contains(text(),'Show more')]"),
+            // Hacker News
+            ...document.querySelectorAll('a.morelink'),
+        ];
+
+        let expanded = 0;
+        let loadMoreLinks = getLoadMoreLinks();
+        const startTime = Date.now();
+
+        while (loadMoreLinks.length > 0) {
+            for (const link of loadMoreLinks) {
+                // Skip certain elements
+                if (link.slot === 'children') continue;
+
+                try {
+                    link.scrollIntoView({ behavior: 'smooth' });
+                    link.click();
+                    expanded++;
+                    await wait(delay);
+                } catch (e) {
+                    // Ignore click errors
+                }
+
+                // Check limits
+                if (expanded >= limit) return expanded;
+                if (Date.now() - startTime >= timeout) return expanded;
+            }
+
+            // Check for new load more links after clicking
+            await wait(delay);
+            loadMoreLinks = getLoadMoreLinks();
+        }
+
+        return expanded;
+    }, { timeout, limit, delay });
+
+    if (numExpanded > 0) {
+        console.error(`Clicked ${numExpanded} "load more" buttons`);
+    }
+
+    return {
+        detailsExpanded,
+        commentsExpanded: numExpanded,
+        total: detailsExpanded + numExpanded,
+    };
+}
+
 async function scrollDown(page, options = {}) {
     const {
         timeout = 120000,
@@ -206,6 +333,7 @@ async function main() {
     const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600);
     const scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10);
     const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000);
+    const expandDetailsEnabled = getEnvBool('INFINISCROLL_EXPAND_DETAILS', true);
 
     const cdpUrl = getCdpUrl();
     if (!cdpUrl) {
@@ -247,6 +375,18 @@ async function main() {
         await page.setViewport({ width: resolution[0] || 1440, height: resolution[1] || 2000 });
 
         console.error(`Starting infinite scroll on ${url}`);
+
+        // Expand <details> and comments before scrolling (if enabled)
+        let expandResult = { total: 0, detailsExpanded: 0, commentsExpanded: 0 };
+        if (expandDetailsEnabled) {
+            console.error('Expanding <details> and comments...');
+            expandResult = await expandDetails(page, {
+                timeout: Math.min(timeout / 4, 30000),
+                limit: 500,
+                delay: scrollDelay / 4,
+            });
+        }
+
         const result = await scrollDown(page, {
             timeout,
             scrollDelay,
@@ -255,13 +395,26 @@ async function main() {
             minHeight,
         });
 
+        // Expand again after scrolling (new content may have loaded)
+        if (expandDetailsEnabled) {
+            const expandResult2 = await expandDetails(page, {
+                timeout: Math.min(timeout / 4, 30000),
+                limit: 500,
+                delay: scrollDelay / 4,
+            });
+            expandResult.total += expandResult2.total;
+            expandResult.detailsExpanded += expandResult2.detailsExpanded;
+            expandResult.commentsExpanded += expandResult2.commentsExpanded;
+        }
+
         browser.disconnect();
 
         const elapsedSec = (result.elapsedMs / 1000).toFixed(1);
         const finalHeightStr = result.finalHeight.toLocaleString();
         const addedHeight = result.finalHeight - result.startingHeight;
         const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content';
-        const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}) over ${elapsedSec}s`;
+        const expandStr = expandResult.total > 0 ? `, expanded ${expandResult.total}` : '';
+        const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}${expandStr}) over ${elapsedSec}s`;
 
         console.error(`Success: ${outputStr}`);
         console.log(JSON.stringify({