|
|
@@ -6,6 +6,8 @@
|
|
|
* ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached.
|
|
|
* Stops early if no new content loads after a scroll.
|
|
|
*
|
|
|
+ * Optionally expands <details> elements and clicks "load more" buttons.
|
|
|
+ *
|
|
|
* Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>
|
|
|
* Output: JSONL with scroll stats (no files created)
|
|
|
*
|
|
|
@@ -16,6 +18,7 @@
|
|
|
* INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600)
|
|
|
* INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10)
|
|
|
* INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000)
|
|
|
+ * INFINISCROLL_EXPAND_DETAILS: Expand <details> and comments (default: true)
|
|
|
*/
|
|
|
|
|
|
function getEnv(name, defaultValue = '') {
|
|
|
@@ -91,6 +94,130 @@ function sleep(ms) {
|
|
|
return new Promise(resolve => setTimeout(resolve, ms));
|
|
|
}
|
|
|
|
|
|
/**
 * Expand collapsed <details> elements and click "load more" style controls
 * so that hidden content and comments are present in the DOM.
 * Based on archivebox.ts expandComments function.
 *
 * Both phases run inside the page via `page.evaluate`, so the callbacks passed
 * to it must stay self-contained (no references to variables of this module).
 *
 * @param {object} page - Page handle exposing `evaluate(fn, arg)` (presumably a Puppeteer Page).
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Max milliseconds to spend clicking "load more" controls.
 * @param {number} [options.limit=500] - Max number of "load more" clicks.
 * @param {number} [options.delay=500] - Milliseconds to wait after each click and between passes.
 * @returns {Promise<{detailsExpanded: number, commentsExpanded: number, total: number}>}
 *   Number of <details> opened, number of controls clicked, and their sum.
 */
async function expandDetails(page, options = {}) {
    const {
        timeout = 30000,
        limit = 500,
        delay = 500,
    } = options;

    // First, expand all <details> elements
    const detailsExpanded = await page.evaluate(() => {
        // A single generic query is sufficient: it also covers Github README
        // sections (`article details`), Github issue discussion hidden comments
        // (`div.js-discussion details`) and HedgeDoc/Markdown sections
        // (`.markdown-body details`). Scoped follow-up queries would never match
        // anything, because every closed <details> is already open by then.
        let count = 0;
        document.querySelectorAll('details:not([open])').forEach(el => {
            el.open = true;
            count++;
        });
        return count;
    });

    if (detailsExpanded > 0) {
        console.error(`Expanded ${detailsExpanded} <details> elements`);
    }

    // Then click "load more" buttons for comments
    const numExpanded = await page.evaluate(async ({ timeout, limit, delay }) => {
        // Helper to find elements by XPath
        function getElementsByXPath(xpath) {
            const results = [];
            const xpathResult = document.evaluate(
                xpath,
                document,
                null,
                XPathResult.ORDERED_NODE_ITERATOR_TYPE,
                null
            );
            let node;
            while ((node = xpathResult.iterateNext()) != null) {
                results.push(node);
            }
            return results;
        }

        const wait = (ms) => new Promise(res => setTimeout(res, ms));

        // Find all "load more" type buttons/links
        const getLoadMoreLinks = () => [
            // Reddit (new)
            ...document.querySelectorAll('faceplate-partial[loading=action]'),
            // Reddit (old) - show more replies
            ...document.querySelectorAll('a[onclick^="return morechildren"]'),
            // Reddit (old) - show hidden replies
            ...document.querySelectorAll('a[onclick^="return togglecomment"]'),
            // Twitter/X - show more replies
            ...getElementsByXPath("//*[text()='Show more replies']"),
            ...getElementsByXPath("//*[text()='Show replies']"),
            // Generic "load more" / "show more" buttons
            ...getElementsByXPath("//*[contains(text(),'Load more')]"),
            ...getElementsByXPath("//*[contains(text(),'Show more')]"),
            // Hacker News
            ...document.querySelectorAll('a.morelink'),
        ];

        // Elements slotted as 'children' are never clicked. Dropping them here
        // (rather than skipping inside the click loop) guarantees that a page
        // containing only such elements ends the loop instead of spinning on
        // links that will never be clicked and never disappear.
        const getClickableLinks = () =>
            getLoadMoreLinks().filter(link => link.slot !== 'children');

        const startTime = Date.now();
        const isExhausted = (clicks) =>
            clicks >= limit || Date.now() - startTime >= timeout;

        let expanded = 0;
        let loadMoreLinks = getClickableLinks();

        // The outer guard bounds the loop even when a whole pass performs no
        // successful click (e.g. every click throws).
        while (loadMoreLinks.length > 0 && !isExhausted(expanded)) {
            for (const link of loadMoreLinks) {
                try {
                    link.scrollIntoView({ behavior: 'smooth' });
                    link.click();
                    expanded++;
                    await wait(delay);
                } catch (e) {
                    // Best-effort: a control that cannot be clicked (detached,
                    // not an HTMLElement, ...) must not abort the whole pass.
                }

                // Check limits
                if (isExhausted(expanded)) return expanded;
            }

            // Check for new load more links after clicking
            await wait(delay);
            loadMoreLinks = getClickableLinks();
        }

        return expanded;
    }, { timeout, limit, delay });

    if (numExpanded > 0) {
        console.error(`Clicked ${numExpanded} "load more" buttons`);
    }

    return {
        detailsExpanded,
        commentsExpanded: numExpanded,
        total: detailsExpanded + numExpanded,
    };
}
|
|
|
+
|
|
|
async function scrollDown(page, options = {}) {
|
|
|
const {
|
|
|
timeout = 120000,
|
|
|
@@ -206,6 +333,7 @@ async function main() {
|
|
|
const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600);
|
|
|
const scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10);
|
|
|
const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000);
|
|
|
+ const expandDetailsEnabled = getEnvBool('INFINISCROLL_EXPAND_DETAILS', true);
|
|
|
|
|
|
const cdpUrl = getCdpUrl();
|
|
|
if (!cdpUrl) {
|
|
|
@@ -247,6 +375,18 @@ async function main() {
|
|
|
await page.setViewport({ width: resolution[0] || 1440, height: resolution[1] || 2000 });
|
|
|
|
|
|
console.error(`Starting infinite scroll on ${url}`);
|
|
|
+
|
|
|
+ // Expand <details> and comments before scrolling (if enabled)
|
|
|
+ let expandResult = { total: 0, detailsExpanded: 0, commentsExpanded: 0 };
|
|
|
+ if (expandDetailsEnabled) {
|
|
|
+ console.error('Expanding <details> and comments...');
|
|
|
+ expandResult = await expandDetails(page, {
|
|
|
+ timeout: Math.min(timeout / 4, 30000),
|
|
|
+ limit: 500,
|
|
|
+ delay: scrollDelay / 4,
|
|
|
+ });
|
|
|
+ }
|
|
|
+
|
|
|
const result = await scrollDown(page, {
|
|
|
timeout,
|
|
|
scrollDelay,
|
|
|
@@ -255,13 +395,26 @@ async function main() {
|
|
|
minHeight,
|
|
|
});
|
|
|
|
|
|
+ // Expand again after scrolling (new content may have loaded)
|
|
|
+ if (expandDetailsEnabled) {
|
|
|
+ const expandResult2 = await expandDetails(page, {
|
|
|
+ timeout: Math.min(timeout / 4, 30000),
|
|
|
+ limit: 500,
|
|
|
+ delay: scrollDelay / 4,
|
|
|
+ });
|
|
|
+ expandResult.total += expandResult2.total;
|
|
|
+ expandResult.detailsExpanded += expandResult2.detailsExpanded;
|
|
|
+ expandResult.commentsExpanded += expandResult2.commentsExpanded;
|
|
|
+ }
|
|
|
+
|
|
|
browser.disconnect();
|
|
|
|
|
|
const elapsedSec = (result.elapsedMs / 1000).toFixed(1);
|
|
|
const finalHeightStr = result.finalHeight.toLocaleString();
|
|
|
const addedHeight = result.finalHeight - result.startingHeight;
|
|
|
const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content';
|
|
|
- const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}) over ${elapsedSec}s`;
|
|
|
+ const expandStr = expandResult.total > 0 ? `, expanded ${expandResult.total}` : '';
|
|
|
+ const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}${expandStr}) over ${elapsedSec}s`;
|
|
|
|
|
|
console.error(`Success: ${outputStr}`);
|
|
|
console.log(JSON.stringify({
|