const puppeteer = require("puppeteer-extra"); const StealthPlugin = require("puppeteer-extra-plugin-stealth"); const randomUseragent = require("random-useragent"); const fs = require("fs"); const { randomSleep, simulateHumanBehavior, handleRateLimitedRequest, } = require("./utils.js"); puppeteer.use(StealthPlugin()); const INSTAGRAM_URL = "https://www.instagram.com"; const SESSION_FILE = "session_cookies.json"; async function loginWithSession( { username, password }, proxy = null, useExistingSession = true ) { const browserArgs = []; if (proxy) browserArgs.push(`--proxy-server=${proxy}`); const userAgent = randomUseragent.getRandom(); const browser = await puppeteer.launch({ headless: false, args: browserArgs, }); const page = await browser.newPage(); await page.setUserAgent(userAgent); // Set a large viewport to ensure modal behavior (Instagram shows modals on desktop/large screens) await page.setViewport({ width: 1920, // Standard desktop width height: 1080, // Standard desktop height }); // Set browser timezone await page.evaluateOnNewDocument(() => { Object.defineProperty(Intl.DateTimeFormat.prototype, "resolvedOptions", { value: function () { return { timeZone: "America/New_York" }; }, }); }); // Monitor for rate limit responses page.on("response", (response) => { if (response.status() === 429) { console.log( `WARNING: Rate limit detected (429) on ${response .url() .substring(0, 80)}...` ); } }); // Try to load existing session if available if (useExistingSession && fs.existsSync(SESSION_FILE)) { try { console.log("Found existing session, attempting to reuse..."); const sessionData = JSON.parse(fs.readFileSync(SESSION_FILE, "utf-8")); if (sessionData.cookies && sessionData.cookies.length > 0) { await page.setCookie(...sessionData.cookies); console.log( `Loaded ${sessionData.cookies.length} cookies from session` ); // Navigate to Instagram to check if session is valid await page.goto(INSTAGRAM_URL, { waitUntil: "networkidle2" }); await randomSleep(2000, 3000); // Check if we're logged in by looking for profile link or login page const isLoggedIn = await page.evaluate(() => { // If we see login/signup links, we're not logged in const loginLink = document.querySelector( 'a[href="/accounts/login/"]' ); return !loginLink; }); if (isLoggedIn) { console.log("Session is valid! Skipping login."); return { browser, page, sessionReused: true }; } else { console.log("Session expired, proceeding with fresh login..."); } } } catch (error) { console.log("Failed to load session, proceeding with fresh login..."); } } // Fresh login flow return await performLogin(page, { username, password }, browser); } async function performLogin(page, { username, password }, browser) { // Navigate to login page await handleRateLimitedRequest( page, async () => { await page.goto(`${INSTAGRAM_URL}/accounts/login/`, { waitUntil: "networkidle2", }); }, "during login page load" ); console.log("Waiting for login form to appear..."); // Wait for the actual login form to load await page.waitForSelector('input[name="username"]', { visible: true, timeout: 60000, }); console.log("Login form loaded!"); // Simulate human behavior await simulateHumanBehavior(page, { mouseMovements: 3, scrolls: 1 }); await randomSleep(500, 1000); await page.type('input[name="username"]', username, { delay: 130 }); await randomSleep(300, 700); await page.type('input[name="password"]', password, { delay: 120 }); await simulateHumanBehavior(page, { mouseMovements: 2, scrolls: 0 }); await randomSleep(500, 1000); await Promise.all([ page.click('button[type="submit"]'), page.waitForNavigation({ waitUntil: "networkidle2" }), ]); await randomSleep(1000, 2000); return { browser, page, sessionReused: false }; } async function extractSession(page) { // Return cookies/session tokens for reuse const cookies = await page.cookies(); return { cookies }; } async function getFollowingsList(page, targetUsername, maxUsers = 100) { const followingData = []; const followingUsernames = []; let requestCount = 0; const requestsPerBatch = 12; // Instagram typically returns ~12 users per request // Set up response listener to capture API responses (no need for request interception) page.on("response", async (response) => { const url = response.url(); // Intercept the following list API endpoint if (url.includes("/friendships/") && url.includes("/following/")) { try { const json = await response.json(); // Check for rate limit in response if (json.status === "fail" || json.message?.includes("rate limit")) { console.log("WARNING: Rate limit detected in API response"); return; } if (json.users && Array.isArray(json.users)) { json.users.forEach((user) => { if (followingData.length < maxUsers) { followingData.push({ pk: user.pk, pk_id: user.pk_id, username: user.username, full_name: user.full_name, profile_pic_url: user.profile_pic_url, is_verified: user.is_verified, is_private: user.is_private, fbid_v2: user.fbid_v2, latest_reel_media: user.latest_reel_media, account_badges: user.account_badges, }); followingUsernames.push(user.username); } }); requestCount++; console.log( `Captured ${followingData.length} users so far (Request #${requestCount})...` ); } } catch (err) { // Not JSON or parsing error, ignore } } }); await handleRateLimitedRequest( page, async () => { await page.goto(`${INSTAGRAM_URL}/${targetUsername}/`, { waitUntil: "networkidle2", }); }, `while loading profile @${targetUsername}` ); // Simulate browsing the profile before clicking following await simulateHumanBehavior(page, { mouseMovements: 4, scrolls: 2 }); await randomSleep(1000, 2000); await page.waitForSelector('a[href$="/following/"]', { timeout: 10000 }); // Hover over the following link before clicking await page.hover('a[href$="/following/"]'); await randomSleep(300, 600); await page.click('a[href$="/following/"]'); // Wait for either modal or page navigation await randomSleep(1500, 2500); // Detect if modal opened or if we navigated to a new page const layoutType = await page.evaluate(() => { const hasModal = !!document.querySelector('div[role="dialog"]'); const urlHasFollowing = window.location.pathname.includes("/following"); return { hasModal, urlHasFollowing }; }); if (layoutType.hasModal) { console.log("Following modal opened (desktop layout)"); } else if (layoutType.urlHasFollowing) { console.log("Navigated to following page (mobile/small viewport layout)"); } else { console.log("Warning: Could not detect following list layout"); } // Wait for the list content to load await randomSleep(1500, 2500); // Verify we can see the list items const hasListItems = await page.evaluate(() => { return ( document.querySelectorAll('div.x1qnrgzn, a[href*="following"]').length > 0 ); }); if (hasListItems) { console.log("Following list loaded successfully"); } else { console.log("Warning: List items not detected, but continuing..."); } // Scroll to load more users while simulating human behavior const totalRequests = Math.ceil(maxUsers / requestsPerBatch); let scrollAttempts = 0; const maxScrollAttempts = Math.min(totalRequests * 3, 50000); // Cap at 50k attempts let lastDataLength = 0; let noNewDataCount = 0; console.log( `Will attempt to scroll up to ${maxScrollAttempts} times to reach ${maxUsers} users...` ); while ( followingData.length < maxUsers && scrollAttempts < maxScrollAttempts ) { // Check if we're still getting new data if (followingData.length === lastDataLength) { noNewDataCount++; // If no new data after 8 consecutive scroll attempts, we've reached the end if (noNewDataCount >= 8) { console.log( `No new data after ${noNewDataCount} attempts. Reached end of list.` ); break; } if (noNewDataCount % 3 === 0) { console.log( `Still at ${followingData.length} users after ${noNewDataCount} scrolls...` ); } } else { if (noNewDataCount > 0) { console.log( `Got new data! Now at ${followingData.length} users (was stuck for ${noNewDataCount} attempts)` ); } noNewDataCount = 0; // Reset counter when we get new data lastDataLength = followingData.length; } // Every ~12 users loaded (one request completed), simulate human behavior if ( requestCount > 0 && requestCount % Math.max(1, Math.ceil(totalRequests / 5)) === 0 ) { await simulateHumanBehavior(page, { mouseMovements: 2, scrolls: 0, // We're manually controlling scroll below }); } // Occasionally move mouse while scrolling if (scrollAttempts % 5 === 0) { const viewport = await page.viewport(); await page.mouse.move( Math.floor(Math.random() * viewport.width), Math.floor(Math.random() * viewport.height), { steps: 10 } ); } // Scroll the dialog's scrollable container - comprehensive approach const scrollResult = await page.evaluate(() => { // Find the scrollable container inside the dialog const dialog = document.querySelector('div[role="dialog"]'); if (!dialog) { return { success: false, error: "No dialog found", scrolled: false }; } // Look for the scrollable div - it has overflow: hidden auto const scrollableElements = dialog.querySelectorAll("div"); let scrollContainer = null; for (const elem of scrollableElements) { const style = window.getComputedStyle(elem); const overflow = style.overflow || style.overflowY; // Check if element is scrollable if ( (overflow === "auto" || overflow === "scroll") && elem.scrollHeight > elem.clientHeight ) { scrollContainer = elem; break; } } if (!scrollContainer) { // Fallback: try specific class from your HTML scrollContainer = dialog.querySelector("div.x6nl9eh") || dialog.querySelector('div[style*="overflow"]'); } if (!scrollContainer) { return { success: false, error: "No scrollable container found", scrolled: false, }; } const oldScrollTop = scrollContainer.scrollTop; const scrollHeight = scrollContainer.scrollHeight; const clientHeight = scrollContainer.clientHeight; // Scroll down scrollContainer.scrollTop += 400 + Math.floor(Math.random() * 200); const newScrollTop = scrollContainer.scrollTop; const actuallyScrolled = newScrollTop > oldScrollTop; const atBottom = scrollHeight - newScrollTop - clientHeight < 50; return { success: true, scrolled: actuallyScrolled, atBottom: atBottom, scrollTop: newScrollTop, scrollHeight: scrollHeight, }; }); if (!scrollResult.success) { console.log(`Scroll error: ${scrollResult.error}`); // Try alternative: scroll the page itself await page.evaluate(() => window.scrollBy(0, 300)); } else if (!scrollResult.scrolled) { console.log("Reached scroll bottom - cannot scroll further"); } // Check if we've reached the bottom and loading indicator is visible const loadingStatus = await page.evaluate(() => { const loader = document.querySelector('svg[aria-label="Loading..."]'); if (!loader) { return { exists: false, visible: false, reachedBottom: true }; } // Check if loader is in viewport (visible) const rect = loader.getBoundingClientRect(); const isVisible = rect.top >= 0 && rect.left >= 0 && rect.bottom <= window.innerHeight && rect.right <= window.innerWidth; return { exists: true, visible: isVisible, reachedBottom: isVisible }; }); if (!loadingStatus.exists) { // No loading indicator at all - might have reached the actual end console.log("No loading indicator found - may have reached end of list"); } else if (loadingStatus.visible) { // Loader is visible, meaning we've scrolled to it console.log("Loading indicator visible, waiting for more data..."); await randomSleep(2500, 3500); // Wait longer for Instagram to load more } else { // Loader exists but not visible yet, keep scrolling await randomSleep(1500, 2500); } scrollAttempts++; // Progress update every 50 scrolls if (scrollAttempts % 50 === 0) { console.log( `Progress: ${followingData.length} users captured after ${scrollAttempts} scroll attempts...` ); } } console.log(`Total users captured: ${followingData.length}`); return { usernames: followingUsernames.slice(0, maxUsers), fullData: followingData.slice(0, maxUsers), }; } async function scrapeProfile(page, username) { console.log(`Scraping profile: @${username}`); let profileData = { username }; let dataCapture = false; // Set up response listener to intercept API calls const responseHandler = async (response) => { const url = response.url(); try { // Check for GraphQL or REST API endpoints if ( url.includes("/api/v1/users/web_profile_info/") || url.includes("/graphql/query") ) { const contentType = response.headers()["content-type"] || ""; if (!contentType.includes("json")) return; const json = await response.json(); // Handle web_profile_info endpoint (REST API) if (url.includes("web_profile_info") && json.data?.user) { if (dataCapture) return; // Already captured, skip duplicate const user = json.data.user; profileData = { username: user.username, full_name: user.full_name, bio: user.biography || "", followerCount: user.edge_followed_by?.count || 0, followingCount: user.edge_follow?.count || 0, profile_pic_url: user.hd_profile_pic_url_info?.url || user.profile_pic_url, is_verified: user.is_verified, is_private: user.is_private, is_business: user.is_business_account, category: user.category_name, external_url: user.external_url, email: null, phone: null, }; // Extract email/phone from bio if (profileData.bio) { const emailMatch = profileData.bio.match( /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/ ); profileData.email = emailMatch ? emailMatch[0] : null; const phoneMatch = profileData.bio.match( /(\+\d{1,3}[- ]?)?\d{10,14}/ ); profileData.phone = phoneMatch ? phoneMatch[0] : null; } dataCapture = true; } // Handle GraphQL endpoint else if (url.includes("graphql") && json.data?.user) { if (dataCapture) return; // Already captured, skip duplicate const user = json.data.user; profileData = { username: user.username, full_name: user.full_name, bio: user.biography || "", followerCount: user.follower_count || 0, followingCount: user.following_count || 0, profile_pic_url: user.hd_profile_pic_url_info?.url || user.profile_pic_url, is_verified: user.is_verified, is_private: user.is_private, is_business: user.is_business_account || user.is_business, category: user.category_name || user.category, external_url: user.external_url, email: null, phone: null, }; // Extract email/phone from bio if (profileData.bio) { const emailMatch = profileData.bio.match( /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/ ); profileData.email = emailMatch ? emailMatch[0] : null; const phoneMatch = profileData.bio.match( /(\+\d{1,3}[- ]?)?\d{10,14}/ ); profileData.phone = phoneMatch ? phoneMatch[0] : null; } dataCapture = true; } } } catch (e) { // Ignore errors from parsing non-JSON responses } }; page.on("response", responseHandler); // Navigate to profile page await handleRateLimitedRequest( page, async () => { await page.goto(`${INSTAGRAM_URL}/${username}/`, { waitUntil: "domcontentloaded", }); }, `while loading profile @${username}` ); // Wait for API calls to complete await randomSleep(2000, 3000); // Remove listener page.off("response", responseHandler); // If API capture worked, return the data if (dataCapture) { return profileData; } // Otherwise, fall back to DOM scraping console.log(`⚠️ API capture failed for @${username}, using DOM fallback...`); return await scrapeProfileFallback(page, username); } // Fallback function using DOM scraping async function scrapeProfileFallback(page, username) { console.log(`Using DOM scraping for @${username}...`); const domData = await page.evaluate(() => { // Try multiple selectors for bio let bio = ""; const bioSelectors = [ "span._ap3a._aaco._aacu._aacx._aad7._aade", // Updated bio class (2025) "span._ap3a._aaco._aacu._aacx._aad6._aade", // Previous bio class "div._aacl._aaco._aacu._aacx._aad7._aade", // Alternative bio with _aad7 "div._aacl._aaco._aacu._aacx._aad6._aade", // Alternative bio with _aad6 "h1 + div span", // Bio after username "header section div span", // Generic header bio 'div.x7a106z span[dir="auto"]', // Bio container with dir attribute ]; for (const selector of bioSelectors) { const elem = document.querySelector(selector); if (elem && elem.innerText && elem.innerText.length > 3) { bio = elem.innerText; break; } } // Get follower/following counts using href-based selectors (stable) let followerCount = 0; let followingCount = 0; // Method 1: Find by href (most reliable) const followersLink = document.querySelector('a[href*="/followers/"]'); const followingLink = document.querySelector('a[href*="/following/"]'); if (followersLink) { const text = followersLink.innerText || followersLink.textContent || ""; const match = text.match(/[\d,\.]+/); if (match) { followerCount = match[0].replace(/,/g, "").replace(/\./g, ""); } } if (followingLink) { const text = followingLink.innerText || followingLink.textContent || ""; const match = text.match(/[\d,\.]+/); if (match) { followingCount = match[0].replace(/,/g, "").replace(/\./g, ""); } } // Alternative: Look in meta tags if href method fails if (!followerCount) { const metaContent = document.querySelector('meta[property="og:description"]')?.content || ""; const followerMatch = metaContent.match(/([\d,\.KMB]+)\s+Followers/i); const followingMatch = metaContent.match(/([\d,\.KMB]+)\s+Following/i); if (followerMatch) followerCount = followerMatch[1].replace(/,/g, ""); if (followingMatch) followingCount = followingMatch[1].replace(/,/g, ""); } // Extract email/phone from bio let emailMatch = bio.match( /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/ ); let email = emailMatch ? emailMatch[0] : null; let phoneMatch = bio.match(/(\+\d{1,3}[- ]?)?\d{10,14}/); let phone = phoneMatch ? phoneMatch[0] : null; return { bio, followerCount: parseInt(followerCount) || 0, followingCount: parseInt(followingCount) || 0, email, phone, }; }); return { username, ...domData, }; } async function cronJobs(fn, intervalSec, stopAfter = 0) { let runCount = 0; let stop = false; const timer = setInterval(async () => { if (stop || (stopAfter && runCount >= stopAfter)) { clearInterval(timer); return; } await fn(); runCount++; }, intervalSec * 1000); return () => { stop = true; }; } async function scrapeWorkflow( creds, targetUsername, proxy = null, maxFollowingToScrape = 10 ) { const { browser, page } = await login(creds, proxy); try { // Extract current session details for persistence const session = await extractSession(page); // Grab followings with full data const followingsData = await getFollowingsList( page, targetUsername, maxFollowingToScrape ); console.log( `Processing ${followingsData.usernames.length} following accounts...` ); for (let i = 0; i < followingsData.usernames.length; i++) { // Add occasional longer breaks to simulate human behavior if (i > 0 && i % 10 === 0) { console.log(`Taking a human-like break after ${i} profiles...`); await simulateHumanBehavior(page, { mouseMovements: 5, scrolls: 3 }); await randomSleep(5000, 10000); // Longer break every 10 profiles } const profileInfo = await scrapeProfile( page, followingsData.usernames[i] ); console.log(JSON.stringify(profileInfo)); // Implement rate limiting + anti-bot sleep await randomSleep(2500, 6000); } // Optionally return the full data for further processing return { session, followingsFullData: followingsData.fullData, scrapedProfiles: followingsData.usernames.length, }; } catch (err) { console.error("Scrape error:", err); } finally { await browser.close(); } } module.exports = { loginWithSession, extractSession, scrapeWorkflow, getFollowingsList, scrapeProfile, cronJobs, };