// 724 lines · 22 KiB · JavaScript
const puppeteer = require("puppeteer-extra");
|
|
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
|
|
const randomUseragent = require("random-useragent");
|
|
const fs = require("fs");
|
|
const {
|
|
randomSleep,
|
|
simulateHumanBehavior,
|
|
handleRateLimitedRequest,
|
|
} = require("./utils.js");
|
|
|
|
puppeteer.use(StealthPlugin());
|
|
|
|
const INSTAGRAM_URL = "https://www.instagram.com";
|
|
const SESSION_FILE = "session_cookies.json";
|
|
|
|
/**
 * Launch a browser and obtain a logged-in Instagram page, preferring cookies
 * saved in SESSION_FILE over a fresh credential login.
 *
 * @param {{username: string, password: string}} credentials - Only used when a
 *   fresh login is required.
 * @param {?string} [proxy=null] - Passed to the browser as `--proxy-server`.
 * @param {boolean} [useExistingSession=true] - Try SESSION_FILE cookies first.
 * @returns {Promise<{browser: object, page: object, sessionReused: boolean}>}
 *   On success the caller owns the browser and is responsible for closing it.
 * @throws Rethrows any setup or login error after closing the browser, so a
 *   failed attempt does not leave a launched browser behind.
 */
async function loginWithSession(
  { username, password },
  proxy = null,
  useExistingSession = true
) {
  const browserArgs = [];
  if (proxy) browserArgs.push(`--proxy-server=${proxy}`);
  const userAgent = randomUseragent.getRandom();

  const browser = await puppeteer.launch({
    headless: false,
    args: browserArgs,
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(userAgent);

    // Set a large viewport to ensure modal behavior (Instagram shows modals on desktop/large screens)
    await page.setViewport({
      width: 1920, // Standard desktop width
      height: 1080, // Standard desktop height
    });

    // Report a fixed timezone to page scripts. The native result is kept and
    // only `timeZone` is overridden, so locale/calendar/etc. remain available
    // to any script that reads them.
    await page.evaluateOnNewDocument(() => {
      const nativeResolvedOptions =
        Intl.DateTimeFormat.prototype.resolvedOptions;
      Object.defineProperty(Intl.DateTimeFormat.prototype, "resolvedOptions", {
        value: function () {
          return {
            ...nativeResolvedOptions.call(this),
            timeZone: "America/New_York",
          };
        },
      });
    });

    // Monitor for rate limit responses
    page.on("response", (response) => {
      if (response.status() === 429) {
        console.log(
          `WARNING: Rate limit detected (429) on ${response
            .url()
            .substring(0, 80)}...`
        );
      }
    });

    // Try to load existing session if available
    if (useExistingSession && fs.existsSync(SESSION_FILE)) {
      try {
        console.log("Found existing session, attempting to reuse...");
        const sessionData = JSON.parse(fs.readFileSync(SESSION_FILE, "utf-8"));

        if (sessionData.cookies && sessionData.cookies.length > 0) {
          await page.setCookie(...sessionData.cookies);
          console.log(
            `Loaded ${sessionData.cookies.length} cookies from session`
          );

          // Navigate to Instagram to check if session is valid
          await page.goto(INSTAGRAM_URL, { waitUntil: "networkidle2" });
          await randomSleep(2000, 3000);

          // A visible link to the login page means the cookies were not accepted
          const isLoggedIn = await page.evaluate(() => {
            const loginLink = document.querySelector(
              'a[href="/accounts/login/"]'
            );
            return !loginLink;
          });

          if (isLoggedIn) {
            console.log("Session is valid! Skipping login.");
            return { browser, page, sessionReused: true };
          }
          console.log("Session expired, proceeding with fresh login...");
        }
      } catch (error) {
        // Best effort: an unreadable/corrupt session file or a failed probe
        // must not prevent the credential login below, but say why it failed.
        console.log(
          "Failed to load session, proceeding with fresh login...",
          error?.message ?? error
        );
      }
    }

    // Fresh login flow
    return await performLogin(page, { username, password }, browser);
  } catch (error) {
    // Nothing is returned to the caller on failure, so nobody else could
    // close this browser; close it here and surface the original error.
    await browser.close().catch(() => {});
    throw error;
  }
}
|
|
|
|
/**
 * Log in through Instagram's login form with a username and password.
 *
 * @param {object} page - Puppeteer page to drive.
 * @param {{username: string, password: string}} credentials - Values typed into the form.
 * @param {object} browser - Passed through unchanged in the result.
 * @returns {Promise<{browser: object, page: object, sessionReused: false}>}
 * @throws {Error} When username or password is missing/empty (checked before
 *   any navigation), or when navigation / selector waits fail.
 */
async function performLogin(page, { username, password }, browser) {
  // Fail fast with a clear message instead of loading the login page and then
  // failing inside page.type() or timing out on a form that cannot be submitted.
  const hasUsername = typeof username === "string" && username !== "";
  const hasPassword = typeof password === "string" && password !== "";
  if (!hasUsername || !hasPassword) {
    throw new Error("performLogin requires a non-empty username and password");
  }

  // Navigate to login page
  await handleRateLimitedRequest(
    page,
    async () => {
      await page.goto(`${INSTAGRAM_URL}/accounts/login/`, {
        waitUntil: "networkidle2",
      });
    },
    "during login page load"
  );

  console.log("Waiting for login form to appear...");

  // Wait for the actual login form to load
  await page.waitForSelector('input[name="username"]', {
    visible: true,
    timeout: 60000,
  });

  console.log("Login form loaded!");

  // Simulate human behavior
  await simulateHumanBehavior(page, { mouseMovements: 3, scrolls: 1 });
  await randomSleep(500, 1000);

  await page.type('input[name="username"]', username, { delay: 130 });
  await randomSleep(300, 700);
  await page.type('input[name="password"]', password, { delay: 120 });

  await simulateHumanBehavior(page, { mouseMovements: 2, scrolls: 0 });
  await randomSleep(500, 1000);

  // Start waiting for the navigation together with the click so the
  // navigation triggered by the submit cannot be missed.
  await Promise.all([
    page.click('button[type="submit"]'),
    page.waitForNavigation({ waitUntil: "networkidle2" }),
  ]);

  await randomSleep(1000, 2000);

  return { browser, page, sessionReused: false };
}
|
|
|
|
/**
 * Snapshot the page's cookies so the session can be reused later.
 *
 * @param {object} page - Puppeteer page; only `page.cookies()` is called.
 * @returns {Promise<{cookies: Array<object>}>} The cookies wrapped in the
 *   `{ cookies }` shape that loginWithSession expects to find in SESSION_FILE.
 */
async function extractSession(page) {
  const cookies = await page.cookies();
  return { cookies };
}
|
|
|
|
/**
 * Keep only the fields of a following-list API user that this scraper reports.
 *
 * @param {object} user - One entry of the `users` array in the API response.
 * @returns {object} Plain object with the selected fields.
 */
function pickFollowingUserFields(user) {
  return {
    pk: user.pk,
    pk_id: user.pk_id,
    username: user.username,
    full_name: user.full_name,
    profile_pic_url: user.profile_pic_url,
    is_verified: user.is_verified,
    is_private: user.is_private,
    fbid_v2: user.fbid_v2,
    latest_reel_media: user.latest_reel_media,
    account_badges: user.account_badges,
  };
}

/**
 * Scroll the following dialog's list down by a randomised step.
 *
 * Passed to page.evaluate, so it runs in the page and must not reference
 * anything declared on the Node side.
 *
 * @returns {{success: boolean, scrolled: boolean, error?: string,
 *   atBottom?: boolean, scrollTop?: number, scrollHeight?: number}}
 */
function scrollFollowingDialogInPage() {
  const dialog = document.querySelector('div[role="dialog"]');
  if (!dialog) {
    return { success: false, error: "No dialog found", scrolled: false };
  }

  let scrollContainer = null;
  for (const elem of dialog.querySelectorAll("div")) {
    // Read overflow-y directly: the computed `overflow` shorthand is never
    // empty and serialises as two keywords (e.g. "hidden auto") when the axes
    // differ, so comparing the shorthand to "auto" misses such containers.
    const overflowY = window.getComputedStyle(elem).overflowY;
    const canScroll = overflowY === "auto" || overflowY === "scroll";
    if (canScroll && elem.scrollHeight > elem.clientHeight) {
      scrollContainer = elem;
      break;
    }
  }

  if (!scrollContainer) {
    // Fallback: known container class, then any inline overflow style
    scrollContainer =
      dialog.querySelector("div.x6nl9eh") ||
      dialog.querySelector('div[style*="overflow"]');
  }

  if (!scrollContainer) {
    return {
      success: false,
      error: "No scrollable container found",
      scrolled: false,
    };
  }

  const oldScrollTop = scrollContainer.scrollTop;
  const scrollHeight = scrollContainer.scrollHeight;
  const clientHeight = scrollContainer.clientHeight;

  scrollContainer.scrollTop += 400 + Math.floor(Math.random() * 200);

  const newScrollTop = scrollContainer.scrollTop;
  return {
    success: true,
    scrolled: newScrollTop > oldScrollTop,
    atBottom: scrollHeight - newScrollTop - clientHeight < 50,
    scrollTop: newScrollTop,
    scrollHeight,
  };
}

/**
 * Report whether the "Loading..." spinner exists and is fully inside the
 * viewport. Passed to page.evaluate, so it runs in the page.
 *
 * @returns {{exists: boolean, visible: boolean, reachedBottom: boolean}}
 */
function readLoadingIndicatorInPage() {
  const loader = document.querySelector('svg[aria-label="Loading..."]');
  if (!loader) {
    return { exists: false, visible: false, reachedBottom: true };
  }

  const rect = loader.getBoundingClientRect();
  const isVisible =
    rect.top >= 0 &&
    rect.left >= 0 &&
    rect.bottom <= window.innerHeight &&
    rect.right <= window.innerWidth;

  return { exists: true, visible: isVisible, reachedBottom: isVisible };
}

/**
 * Collect the accounts `targetUsername` follows by opening the profile's
 * "following" list, scrolling it, and capturing the JSON responses whose URL
 * contains both "/friendships/" and "/following/".
 *
 * @param {object} page - Logged-in Puppeteer page.
 * @param {string} targetUsername - Profile whose following list is read.
 * @param {number} [maxUsers=100] - Upper bound on the number of users returned.
 * @returns {Promise<{usernames: string[], fullData: object[]}>} Parallel
 *   arrays, each capped at `maxUsers`.
 * @throws Propagates navigation and selector-wait failures.
 */
async function getFollowingsList(page, targetUsername, maxUsers = 100) {
  const followingData = [];
  const followingUsernames = [];
  let requestCount = 0;
  const requestsPerBatch = 12; // Instagram typically returns ~12 users per request

  // Capture API responses as they arrive (no request interception needed)
  const onFollowingResponse = async (response) => {
    const url = response.url();
    if (!url.includes("/friendships/") || !url.includes("/following/")) return;

    try {
      const json = await response.json();

      // Check for rate limit in response
      if (json.status === "fail" || json.message?.includes("rate limit")) {
        console.log("WARNING: Rate limit detected in API response");
        return;
      }

      if (!Array.isArray(json.users)) return;

      for (const user of json.users) {
        if (followingData.length >= maxUsers) break;
        followingData.push(pickFollowingUserFields(user));
        followingUsernames.push(user.username);
      }

      requestCount++;
      console.log(
        `Captured ${followingData.length} users so far (Request #${requestCount})...`
      );
    } catch (err) {
      // Not JSON or parsing error, ignore
    }
  };

  page.on("response", onFollowingResponse);

  try {
    await handleRateLimitedRequest(
      page,
      async () => {
        await page.goto(`${INSTAGRAM_URL}/${targetUsername}/`, {
          waitUntil: "networkidle2",
        });
      },
      `while loading profile @${targetUsername}`
    );

    // Simulate browsing the profile before clicking following
    await simulateHumanBehavior(page, { mouseMovements: 4, scrolls: 2 });
    await randomSleep(1000, 2000);

    await page.waitForSelector('a[href$="/following/"]', { timeout: 10000 });

    // Hover over the following link before clicking
    await page.hover('a[href$="/following/"]');
    await randomSleep(300, 600);

    await page.click('a[href$="/following/"]');

    // Wait for either modal or page navigation
    await randomSleep(1500, 2500);

    // Detect if modal opened or if we navigated to a new page
    const layoutType = await page.evaluate(() => {
      const hasModal = !!document.querySelector('div[role="dialog"]');
      const urlHasFollowing = window.location.pathname.includes("/following");
      return { hasModal, urlHasFollowing };
    });

    if (layoutType.hasModal) {
      console.log("Following modal opened (desktop layout)");
    } else if (layoutType.urlHasFollowing) {
      console.log("Navigated to following page (mobile/small viewport layout)");
    } else {
      console.log("Warning: Could not detect following list layout");
    }

    // Wait for the list content to load
    await randomSleep(1500, 2500);

    // Verify we can see the list items
    const hasListItems = await page.evaluate(() => {
      return (
        document.querySelectorAll('div.x1qnrgzn, a[href*="following"]').length >
        0
      );
    });

    if (hasListItems) {
      console.log("Following list loaded successfully");
    } else {
      console.log("Warning: List items not detected, but continuing...");
    }

    // Scroll to load more users while simulating human behavior
    const totalRequests = Math.ceil(maxUsers / requestsPerBatch);
    const maxScrollAttempts = Math.min(totalRequests * 3, 50000); // Cap at 50k attempts
    const humanizeEvery = Math.max(1, Math.ceil(totalRequests / 5));
    let scrollAttempts = 0;
    let lastDataLength = 0;
    let noNewDataCount = 0;
    let lastHumanizedRequest = 0;

    console.log(
      `Will attempt to scroll up to ${maxScrollAttempts} times to reach ${maxUsers} users...`
    );

    while (
      followingData.length < maxUsers &&
      scrollAttempts < maxScrollAttempts
    ) {
      // Check if we're still getting new data
      if (followingData.length === lastDataLength) {
        noNewDataCount++;
        // If no new data after 8 consecutive scroll attempts, we've reached the end
        if (noNewDataCount >= 8) {
          console.log(
            `No new data after ${noNewDataCount} attempts. Reached end of list.`
          );
          break;
        }
        if (noNewDataCount % 3 === 0) {
          console.log(
            `Still at ${followingData.length} users after ${noNewDataCount} scrolls...`
          );
        }
      } else {
        if (noNewDataCount > 0) {
          console.log(
            `Got new data! Now at ${followingData.length} users (was stuck for ${noNewDataCount} attempts)`
          );
        }
        noNewDataCount = 0; // Reset counter when we get new data
        lastDataLength = followingData.length;
      }

      // Simulate human behavior once per qualifying request. Remembering the
      // last request handled stops this from re-firing on every scroll
      // iteration while requestCount stays on the same multiple.
      if (
        requestCount > 0 &&
        requestCount !== lastHumanizedRequest &&
        requestCount % humanizeEvery === 0
      ) {
        lastHumanizedRequest = requestCount;
        await simulateHumanBehavior(page, {
          mouseMovements: 2,
          scrolls: 0, // We're manually controlling scroll below
        });
      }

      // Occasionally move mouse while scrolling
      if (scrollAttempts % 5 === 0) {
        const viewport = await page.viewport();
        await page.mouse.move(
          Math.floor(Math.random() * viewport.width),
          Math.floor(Math.random() * viewport.height),
          { steps: 10 }
        );
      }

      const scrollResult = await page.evaluate(scrollFollowingDialogInPage);

      if (!scrollResult.success) {
        console.log(`Scroll error: ${scrollResult.error}`);
        // Try alternative: scroll the page itself
        await page.evaluate(() => window.scrollBy(0, 300));
      } else if (!scrollResult.scrolled) {
        console.log("Reached scroll bottom - cannot scroll further");
      }

      // Check if we've reached the bottom and loading indicator is visible
      const loadingStatus = await page.evaluate(readLoadingIndicatorInPage);

      if (!loadingStatus.exists) {
        // No loading indicator at all - might have reached the actual end
        console.log("No loading indicator found - may have reached end of list");
      } else if (loadingStatus.visible) {
        // Loader is visible, meaning we've scrolled to it
        console.log("Loading indicator visible, waiting for more data...");
        await randomSleep(2500, 3500); // Wait longer for Instagram to load more
      } else {
        // Loader exists but not visible yet, keep scrolling
        await randomSleep(1500, 2500);
      }

      scrollAttempts++;

      // Progress update every 50 scrolls
      if (scrollAttempts % 50 === 0) {
        console.log(
          `Progress: ${followingData.length} users captured after ${scrollAttempts} scroll attempts...`
        );
      }
    }

    console.log(`Total users captured: ${followingData.length}`);

    return {
      usernames: followingUsernames.slice(0, maxUsers),
      fullData: followingData.slice(0, maxUsers),
    };
  } finally {
    // Always detach: a listener left on the page would keep firing (and keep
    // these arrays alive) during later navigations and repeated calls.
    page.off("response", onFollowingResponse);
  }
}
|
|
|
|
/**
 * Pull the first email address and phone-like number out of a bio string.
 *
 * @param {string} bio - Biography text; may be empty.
 * @returns {{email: ?string, phone: ?string}} `null` for anything not found.
 */
function extractBioContacts(bio) {
  if (!bio) return { email: null, phone: null };

  const emailMatch = bio.match(
    /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/
  );
  const phoneMatch = bio.match(/(\+\d{1,3}[- ]?)?\d{10,14}/);

  return {
    email: emailMatch ? emailMatch[0] : null,
    phone: phoneMatch ? phoneMatch[0] : null,
  };
}

/**
 * Map an API `data.user` payload onto the profile shape this module returns.
 *
 * @param {object} user - `json.data.user` from the intercepted response.
 * @param {boolean} isRestPayload - true for the web_profile_info endpoint
 *   (counts under `edge_*`), false for the GraphQL endpoint (flat counts).
 * @param {string} requestedUsername - Used when the payload has no username.
 * @returns {object} Normalised profile record.
 */
function buildProfileFromApiUser(user, isRestPayload, requestedUsername) {
  const bio = user.biography || "";
  const followerCount = isRestPayload
    ? user.edge_followed_by?.count
    : user.follower_count;
  const followingCount = isRestPayload
    ? user.edge_follow?.count
    : user.following_count;

  return {
    username: user.username ?? requestedUsername,
    full_name: user.full_name,
    bio,
    followerCount: followerCount || 0,
    followingCount: followingCount || 0,
    profile_pic_url: user.hd_profile_pic_url_info?.url || user.profile_pic_url,
    is_verified: user.is_verified,
    is_private: user.is_private,
    is_business: isRestPayload
      ? user.is_business_account
      : user.is_business_account || user.is_business,
    category: isRestPayload
      ? user.category_name
      : user.category_name || user.category,
    external_url: user.external_url,
    ...extractBioContacts(bio),
  };
}

/**
 * Scrape one profile by navigating to it and capturing the profile JSON the
 * page requests; falls back to DOM scraping when no usable payload is seen.
 *
 * @param {object} page - Logged-in Puppeteer page.
 * @param {string} username - Profile to scrape.
 * @returns {Promise<object>} Profile record from the API payload, or the
 *   result of scrapeProfileFallback.
 * @throws Propagates navigation failures (after detaching the listener).
 */
async function scrapeProfile(page, username) {
  console.log(`Scraping profile: @${username}`);

  let profileData = { username };
  let dataCapture = false;
  const wantedUsername = String(username).toLowerCase();

  // Set up response listener to intercept API calls
  const responseHandler = async (response) => {
    const url = response.url();
    const isCandidate =
      url.includes("/api/v1/users/web_profile_info/") ||
      url.includes("/graphql/query");
    if (!isCandidate) return;

    try {
      const contentType = response.headers()["content-type"] || "";
      if (!contentType.includes("json")) return;

      const json = await response.json();
      const user = json.data?.user;
      if (!user) return;
      if (dataCapture) return; // Already captured, skip duplicate

      // Defensive: a payload that names a different account is not the
      // profile that was requested, so keep waiting (or use the DOM fallback).
      if (
        typeof user.username === "string" &&
        user.username.toLowerCase() !== wantedUsername
      ) {
        return;
      }

      const isRestPayload = url.includes("web_profile_info");
      profileData = buildProfileFromApiUser(user, isRestPayload, username);
      dataCapture = true;
    } catch (e) {
      // Ignore errors from parsing non-JSON responses
    }
  };

  page.on("response", responseHandler);

  try {
    // Navigate to profile page
    await handleRateLimitedRequest(
      page,
      async () => {
        await page.goto(`${INSTAGRAM_URL}/${username}/`, {
          waitUntil: "domcontentloaded",
        });
      },
      `while loading profile @${username}`
    );

    // Wait for API calls to complete
    await randomSleep(2000, 3000);
  } finally {
    // Detach even when navigation throws, otherwise every failed profile
    // would leave another listener attached to the shared page.
    page.off("response", responseHandler);
  }

  // If API capture worked, return the data
  if (dataCapture) {
    return profileData;
  }

  // Otherwise, fall back to DOM scraping
  console.log(`⚠️ API capture failed for @${username}, using DOM fallback...`);
  return await scrapeProfileFallback(page, username);
}
|
|
|
|
/**
 * Fallback profile scraper that reads the already-loaded profile page's DOM.
 *
 * Counts are parsed with their abbreviation suffix, so "1.2M" becomes
 * 1200000 and "12.5K" becomes 12500. Counts without a suffix have "," and "."
 * treated as thousands separators ("1,234" and "1.234" both become 1234).
 *
 * @param {object} page - Puppeteer page currently showing the profile.
 * @param {string} username - Echoed back in the result.
 * @returns {Promise<{username: string, bio: string, followerCount: number,
 *   followingCount: number, email: ?string, phone: ?string}>}
 */
async function scrapeProfileFallback(page, username) {
  console.log(`Using DOM scraping for @${username}...`);

  const domData = await page.evaluate(() => {
    // Declared inside the callback because page.evaluate runs this function
    // in the page, where nothing from the Node side is in scope.
    const parseCount = (raw) => {
      // Number must start with a digit; the suffix only counts when it is a
      // standalone letter right after the number (so "12 mil" is not "12M").
      const match = String(raw).match(/(\d[\d.,]*)(?:([KMB])\b)?/i);
      if (!match) return 0;

      const digits = match[1];
      const suffix = match[2];
      if (!suffix) {
        return Number.parseInt(digits.replace(/[.,]/g, ""), 10) || 0;
      }

      const multipliers = { K: 1e3, M: 1e6, B: 1e9 };
      const value = Number.parseFloat(digits.replace(/,/g, ""));
      if (Number.isNaN(value)) return 0;
      return Math.round(value * multipliers[suffix.toUpperCase()]);
    };

    // Try multiple selectors for bio
    let bio = "";
    const bioSelectors = [
      "span._ap3a._aaco._aacu._aacx._aad7._aade", // Updated bio class (2025)
      "span._ap3a._aaco._aacu._aacx._aad6._aade", // Previous bio class
      "div._aacl._aaco._aacu._aacx._aad7._aade", // Alternative bio with _aad7
      "div._aacl._aaco._aacu._aacx._aad6._aade", // Alternative bio with _aad6
      "h1 + div span", // Bio after username
      "header section div span", // Generic header bio
      'div.x7a106z span[dir="auto"]', // Bio container with dir attribute
    ];

    for (const selector of bioSelectors) {
      const elem = document.querySelector(selector);
      if (elem && elem.innerText && elem.innerText.length > 3) {
        bio = elem.innerText;
        break;
      }
    }

    // Method 1: read the counts from the followers/following links
    const readLinkCount = (selector) => {
      const link = document.querySelector(selector);
      if (!link) return 0;
      return parseCount(link.innerText || link.textContent || "");
    };

    let followerCount = readLinkCount('a[href*="/followers/"]');
    let followingCount = readLinkCount('a[href*="/following/"]');

    // Method 2: fill in whichever count is still missing from the meta tag,
    // without overwriting a value already read from a link.
    if (!followerCount || !followingCount) {
      const metaContent =
        document.querySelector('meta[property="og:description"]')?.content ||
        "";
      const followerMatch = metaContent.match(/([\d,\.KMB]+)\s+Followers/i);
      const followingMatch = metaContent.match(/([\d,\.KMB]+)\s+Following/i);

      if (!followerCount && followerMatch) {
        followerCount = parseCount(followerMatch[1]);
      }
      if (!followingCount && followingMatch) {
        followingCount = parseCount(followingMatch[1]);
      }
    }

    // Extract email/phone from bio
    const emailMatch = bio.match(
      /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/
    );
    const phoneMatch = bio.match(/(\+\d{1,3}[- ]?)?\d{10,14}/);

    return {
      bio,
      followerCount,
      followingCount,
      email: emailMatch ? emailMatch[0] : null,
      phone: phoneMatch ? phoneMatch[0] : null,
    };
  });

  return {
    username,
    ...domData,
  };
}
|
|
|
|
/**
 * Run `fn` repeatedly on a fixed interval.
 *
 * - A tick is skipped while the previous run is still in flight, so runs
 *   never overlap.
 * - A run that throws/rejects is logged and counted; it does not stop the
 *   schedule or surface as an unhandled rejection.
 * - The timer is cleared as soon as the run limit is reached or the returned
 *   stop function is called.
 *
 * @param {Function} fn - Job to run; may be sync or async.
 * @param {number} intervalSec - Seconds between ticks.
 * @param {number} [stopAfter=0] - Stop after this many runs; 0 means no limit.
 * @returns {Promise<Function>} Resolves to a function that stops the schedule.
 */
async function cronJobs(fn, intervalSec, stopAfter = 0) {
  let runCount = 0;
  let isRunning = false;
  let isStopped = false;
  let timer = null;

  const stopSchedule = () => {
    isStopped = true;
    clearInterval(timer);
  };

  timer = setInterval(async () => {
    if (isStopped || isRunning) return;

    isRunning = true;
    try {
      await fn();
    } catch (err) {
      // Nothing awaits this callback, so an escaped rejection would be
      // unhandled; report it here and keep the schedule alive.
      console.error("cronJobs: scheduled job failed:", err);
    } finally {
      isRunning = false;
      runCount++;
      if (stopAfter && runCount >= stopAfter) stopSchedule();
    }
  }, intervalSec * 1000);

  return stopSchedule;
}
|
|
|
|
/**
 * End-to-end run: log in, read the target's following list, then scrape each
 * followed profile and print it as one JSON line.
 *
 * @param {{username: string, password: string}} creds - Login credentials.
 * @param {string} targetUsername - Profile whose followings are scraped.
 * @param {?string} [proxy=null] - Optional proxy passed to the login step.
 * @param {number} [maxFollowingToScrape=10] - Cap on followings processed.
 * @returns {Promise<{session: object, followingsFullData: object[],
 *   scrapedProfiles: number}|undefined>} `undefined` when scraping failed;
 *   the error is logged rather than rethrown. Login errors do propagate.
 */
async function scrapeWorkflow(
  creds,
  targetUsername,
  proxy = null,
  maxFollowingToScrape = 10
) {
  // The login entry point in this module is loginWithSession; there is no
  // `login` function, so calling that name threw a ReferenceError.
  const { browser, page } = await loginWithSession(creds, proxy);
  try {
    // Extract current session details for persistence.
    // NOTE(review): nothing visible here writes SESSION_FILE; presumably the
    // caller persists the returned `session` — confirm.
    const session = await extractSession(page);

    // Grab followings with full data
    const followingsData = await getFollowingsList(
      page,
      targetUsername,
      maxFollowingToScrape
    );

    console.log(
      `Processing ${followingsData.usernames.length} following accounts...`
    );

    for (const [index, handle] of followingsData.usernames.entries()) {
      // Add occasional longer breaks to simulate human behavior
      if (index > 0 && index % 10 === 0) {
        console.log(`Taking a human-like break after ${index} profiles...`);
        await simulateHumanBehavior(page, { mouseMovements: 5, scrolls: 3 });
        await randomSleep(5000, 10000); // Longer break every 10 profiles
      }

      const profileInfo = await scrapeProfile(page, handle);
      console.log(JSON.stringify(profileInfo));

      // Rate limiting + anti-bot pause between profiles
      await randomSleep(2500, 6000);
    }

    // Return the full data for further processing
    return {
      session,
      followingsFullData: followingsData.fullData,
      scrapedProfiles: followingsData.usernames.length,
    };
  } catch (err) {
    console.error("Scrape error:", err);
  } finally {
    await browser.close();
  }
}
|
|
|
|
module.exports = {
|
|
loginWithSession,
|
|
extractSession,
|
|
scrapeWorkflow,
|
|
getFollowingsList,
|
|
scrapeProfile,
|
|
cronJobs,
|
|
};
|