// Files
// instagram-scraper/scraper.js
//
// 724 lines
// 22 KiB
// JavaScript

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
const randomUseragent = require("random-useragent");
const fs = require("fs");
const {
randomSleep,
simulateHumanBehavior,
handleRateLimitedRequest,
} = require("./utils.js");
// Register the stealth plugin on the shared puppeteer-extra instance at module load,
// i.e. before any function in this file launches a browser.
puppeteer.use(StealthPlugin());
// Base URL used to build every Instagram address requested by this file.
const INSTAGRAM_URL = "https://www.instagram.com";
// File that loginWithSession reads saved cookies from (expects a JSON object with a `cookies` array).
const SESSION_FILE = "session_cookies.json";
/**
 * Launches a browser and returns a page that is logged in to Instagram.
 *
 * When `useExistingSession` is true and SESSION_FILE exists, its cookies are
 * loaded and the home page is checked for a login link; if no login link is
 * present the saved session is reused. Otherwise, or when anything in that
 * attempt fails, a fresh login is performed through performLogin.
 *
 * @param {{username: string, password: string}} credentials - Account credentials.
 * @param {?string} [proxy=null] - Value for Chromium's --proxy-server flag.
 * @param {boolean} [useExistingSession=true] - Try saved cookies before logging in.
 * @returns {Promise<{browser: object, page: object, sessionReused: boolean}>}
 * @throws Rethrows any setup or login error after closing the launched browser.
 */
async function loginWithSession(
  { username, password },
  proxy = null,
  useExistingSession = true
) {
  const browserArgs = [];
  if (proxy) browserArgs.push(`--proxy-server=${proxy}`);
  const userAgent = randomUseragent.getRandom();
  const browser = await puppeteer.launch({
    headless: false,
    args: browserArgs,
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(userAgent);
    // Large viewport so Instagram uses the desktop layout (lists open as modals).
    await page.setViewport({
      width: 1920, // Standard desktop width
      height: 1080, // Standard desktop height
    });

    // Report a fixed timezone. The native result is preserved and only
    // `timeZone` is overridden, so fields such as `locale` and `calendar`
    // are still present for any page script that reads them.
    await page.evaluateOnNewDocument(() => {
      const nativeResolvedOptions =
        Intl.DateTimeFormat.prototype.resolvedOptions;
      Object.defineProperty(Intl.DateTimeFormat.prototype, "resolvedOptions", {
        value: function () {
          return {
            ...nativeResolvedOptions.call(this),
            timeZone: "America/New_York",
          };
        },
      });
    });

    // Monitor for rate limit responses for the lifetime of the page.
    page.on("response", (response) => {
      if (response.status() === 429) {
        console.log(
          `WARNING: Rate limit detected (429) on ${response
            .url()
            .substring(0, 80)}...`
        );
      }
    });

    // Try to load an existing session if available.
    if (useExistingSession && fs.existsSync(SESSION_FILE)) {
      try {
        console.log("Found existing session, attempting to reuse...");
        const sessionData = JSON.parse(fs.readFileSync(SESSION_FILE, "utf-8"));
        if (sessionData.cookies && sessionData.cookies.length > 0) {
          await page.setCookie(...sessionData.cookies);
          console.log(
            `Loaded ${sessionData.cookies.length} cookies from session`
          );
          // Navigate to Instagram to check whether the session is still valid.
          await page.goto(INSTAGRAM_URL, { waitUntil: "networkidle2" });
          await randomSleep(2000, 3000);
          // A visible login link means the cookies were not accepted.
          const isLoggedIn = await page.evaluate(() => {
            const loginLink = document.querySelector(
              'a[href="/accounts/login/"]'
            );
            return !loginLink;
          });
          if (isLoggedIn) {
            console.log("Session is valid! Skipping login.");
            return { browser, page, sessionReused: true };
          }
          console.log("Session expired, proceeding with fresh login...");
        }
      } catch (error) {
        // Session reuse is best effort; report why it failed and fall through.
        console.log("Failed to load session, proceeding with fresh login...");
        console.log(`Session load error: ${error?.message ?? error}`);
      }
    }

    // Fresh login flow
    return await performLogin(page, { username, password }, browser);
  } catch (error) {
    // The caller never receives the browser handle when we throw, so it must
    // be closed here. A failure while closing must not mask the real error.
    await browser.close().catch(() => {});
    throw error;
  }
}
/**
 * Fills in and submits the Instagram login form on `page`.
 *
 * @param {object} page - Puppeteer page to drive.
 * @param {{username: string, password: string}} credentials - Account credentials.
 * @param {object} browser - Browser owning `page`; passed through to the result.
 * @returns {Promise<{browser: object, page: object, sessionReused: boolean}>}
 *   Always resolves with `sessionReused: false`.
 */
async function performLogin(page, { username, password }, browser) {
  const USERNAME_FIELD = 'input[name="username"]';
  const PASSWORD_FIELD = 'input[name="password"]';
  const SUBMIT_BUTTON = 'button[type="submit"]';
  const loginUrl = `${INSTAGRAM_URL}/accounts/login/`;

  // Open the login page through the rate-limit aware wrapper.
  const openLoginPage = async () => {
    await page.goto(loginUrl, { waitUntil: "networkidle2" });
  };
  await handleRateLimitedRequest(page, openLoginPage, "during login page load");

  // The form is rendered client-side, so wait until the field is visible.
  console.log("Waiting for login form to appear...");
  await page.waitForSelector(USERNAME_FIELD, { visible: true, timeout: 60000 });
  console.log("Login form loaded!");

  // Idle like a person would before starting to type.
  await simulateHumanBehavior(page, { mouseMovements: 3, scrolls: 1 });
  await randomSleep(500, 1000);

  // Enter the credentials with per-keystroke delays.
  await page.type(USERNAME_FIELD, username, { delay: 130 });
  await randomSleep(300, 700);
  await page.type(PASSWORD_FIELD, password, { delay: 120 });

  await simulateHumanBehavior(page, { mouseMovements: 2, scrolls: 0 });
  await randomSleep(500, 1000);

  // Start the click and the navigation wait together so the navigation
  // triggered by the click cannot finish before we begin waiting for it.
  const submitted = page.click(SUBMIT_BUTTON);
  const navigated = page.waitForNavigation({ waitUntil: "networkidle2" });
  await Promise.all([submitted, navigated]);

  await randomSleep(1000, 2000);
  return { browser, page, sessionReused: false };
}
/**
 * Snapshots the page's current cookies so the session can be reused later.
 *
 * @param {object} page - Puppeteer page to read cookies from.
 * @returns {Promise<{cookies: Array<object>}>} Object wrapping the cookie list.
 */
async function extractSession(page) {
  return { cookies: await page.cookies() };
}
/**
 * Collects the accounts that `targetUsername` follows.
 *
 * Opens the profile, clicks the "following" link and scrolls the list while a
 * response listener captures the JSON of every request whose URL contains both
 * "/friendships/" and "/following/". Scrolling stops when `maxUsers` unique
 * users were captured, when 8 consecutive scrolls yield nothing new, or when
 * the scroll budget is used up. The listener is always detached on exit.
 *
 * @param {object} page - Logged-in Puppeteer page.
 * @param {string} targetUsername - Profile whose following list is read.
 * @param {number} [maxUsers=100] - Upper bound on returned users.
 * @returns {Promise<{usernames: string[], fullData: object[]}>} Parallel
 *   arrays (same order, no duplicate usernames) of names and user records.
 * @throws Propagates navigation / selector errors from Puppeteer.
 */
async function getFollowingsList(page, targetUsername, maxUsers = 100) {
  const followingData = [];
  const followingUsernames = [];
  // Usernames already stored; the same user can arrive in more than one response.
  const seenUsernames = new Set();
  let requestCount = 0;
  const requestsPerBatch = 12; // Instagram typically returns ~12 users per request

  // Captures users from the following-list API responses (no request interception needed).
  const captureFollowingResponse = async (response) => {
    const url = response.url();
    if (!url.includes("/friendships/") || !url.includes("/following/")) {
      return;
    }
    try {
      const json = await response.json();
      // Check for rate limit in response
      if (json.status === "fail" || json.message?.includes("rate limit")) {
        console.log("WARNING: Rate limit detected in API response");
        return;
      }
      if (!Array.isArray(json.users)) {
        return;
      }
      for (const user of json.users) {
        if (followingData.length >= maxUsers) break;
        if (seenUsernames.has(user.username)) continue;
        seenUsernames.add(user.username);
        followingData.push({
          pk: user.pk,
          pk_id: user.pk_id,
          username: user.username,
          full_name: user.full_name,
          profile_pic_url: user.profile_pic_url,
          is_verified: user.is_verified,
          is_private: user.is_private,
          fbid_v2: user.fbid_v2,
          latest_reel_media: user.latest_reel_media,
          account_badges: user.account_badges,
        });
        followingUsernames.push(user.username);
      }
      requestCount++;
      console.log(
        `Captured ${followingData.length} users so far (Request #${requestCount})...`
      );
    } catch (err) {
      // Deliberate best effort: the body was not JSON or is no longer
      // available; skip this response and keep listening.
    }
  };

  page.on("response", captureFollowingResponse);
  try {
    await handleRateLimitedRequest(
      page,
      async () => {
        await page.goto(`${INSTAGRAM_URL}/${targetUsername}/`, {
          waitUntil: "networkidle2",
        });
      },
      `while loading profile @${targetUsername}`
    );
    // Simulate browsing the profile before clicking following
    await simulateHumanBehavior(page, { mouseMovements: 4, scrolls: 2 });
    await randomSleep(1000, 2000);
    await page.waitForSelector('a[href$="/following/"]', { timeout: 10000 });
    // Hover over the following link before clicking
    await page.hover('a[href$="/following/"]');
    await randomSleep(300, 600);
    await page.click('a[href$="/following/"]');
    // Wait for either modal or page navigation
    await randomSleep(1500, 2500);

    // Detect if modal opened or if we navigated to a new page
    const layoutType = await page.evaluate(() => {
      const hasModal = !!document.querySelector('div[role="dialog"]');
      const urlHasFollowing = window.location.pathname.includes("/following");
      return { hasModal, urlHasFollowing };
    });
    if (layoutType.hasModal) {
      console.log("Following modal opened (desktop layout)");
    } else if (layoutType.urlHasFollowing) {
      console.log("Navigated to following page (mobile/small viewport layout)");
    } else {
      console.log("Warning: Could not detect following list layout");
    }

    // Wait for the list content to load
    await randomSleep(1500, 2500);
    // Verify we can see the list items
    const hasListItems = await page.evaluate(() => {
      return (
        document.querySelectorAll('div.x1qnrgzn, a[href*="following"]').length >
        0
      );
    });
    if (hasListItems) {
      console.log("Following list loaded successfully");
    } else {
      console.log("Warning: List items not detected, but continuing...");
    }

    // Scroll to load more users while simulating human behavior
    const totalRequests = Math.ceil(maxUsers / requestsPerBatch);
    const humanPauseEvery = Math.max(1, Math.ceil(totalRequests / 5));
    const maxScrollAttempts = Math.min(totalRequests * 3, 50000); // Cap at 50k attempts
    let scrollAttempts = 0;
    let lastDataLength = 0;
    let noNewDataCount = 0;
    // Request count at which the last human-like pause ran, so the pause
    // happens once per qualifying request instead of on every scroll.
    let lastHumanPauseAt = 0;
    console.log(
      `Will attempt to scroll up to ${maxScrollAttempts} times to reach ${maxUsers} users...`
    );

    while (
      followingData.length < maxUsers &&
      scrollAttempts < maxScrollAttempts
    ) {
      // Check if we're still getting new data
      if (followingData.length === lastDataLength) {
        noNewDataCount++;
        // If no new data after 8 consecutive scroll attempts, we've reached the end
        if (noNewDataCount >= 8) {
          console.log(
            `No new data after ${noNewDataCount} attempts. Reached end of list.`
          );
          break;
        }
        if (noNewDataCount % 3 === 0) {
          console.log(
            `Still at ${followingData.length} users after ${noNewDataCount} scrolls...`
          );
        }
      } else {
        if (noNewDataCount > 0) {
          console.log(
            `Got new data! Now at ${followingData.length} users (was stuck for ${noNewDataCount} attempts)`
          );
        }
        noNewDataCount = 0; // Reset counter when we get new data
        lastDataLength = followingData.length;
      }

      // After every `humanPauseEvery` completed requests, simulate human behavior once.
      if (
        requestCount > 0 &&
        requestCount !== lastHumanPauseAt &&
        requestCount % humanPauseEvery === 0
      ) {
        lastHumanPauseAt = requestCount;
        await simulateHumanBehavior(page, {
          mouseMovements: 2,
          scrolls: 0, // We're manually controlling scroll below
        });
      }

      // Occasionally move mouse while scrolling
      if (scrollAttempts % 5 === 0) {
        const viewport = await page.viewport();
        await page.mouse.move(
          Math.floor(Math.random() * viewport.width),
          Math.floor(Math.random() * viewport.height),
          { steps: 10 }
        );
      }

      // Scroll the dialog's scrollable container
      const scrollResult = await page.evaluate(() => {
        const dialog = document.querySelector('div[role="dialog"]');
        if (!dialog) {
          return { success: false, error: "No dialog found", scrolled: false };
        }
        let scrollContainer = null;
        for (const elem of dialog.querySelectorAll("div")) {
          // Inspect the vertical axis only: with `overflow: hidden auto` the
          // shorthand is "hidden auto" and would never compare equal to "auto".
          const overflowY = window.getComputedStyle(elem).overflowY;
          if (
            (overflowY === "auto" || overflowY === "scroll") &&
            elem.scrollHeight > elem.clientHeight
          ) {
            scrollContainer = elem;
            break;
          }
        }
        if (!scrollContainer) {
          // Fallback: known class name, then any inline overflow style
          scrollContainer =
            dialog.querySelector("div.x6nl9eh") ||
            dialog.querySelector('div[style*="overflow"]');
        }
        if (!scrollContainer) {
          return {
            success: false,
            error: "No scrollable container found",
            scrolled: false,
          };
        }
        const oldScrollTop = scrollContainer.scrollTop;
        // Scroll down by a slightly randomised distance
        scrollContainer.scrollTop += 400 + Math.floor(Math.random() * 200);
        return {
          success: true,
          scrolled: scrollContainer.scrollTop > oldScrollTop,
        };
      });
      if (!scrollResult.success) {
        console.log(`Scroll error: ${scrollResult.error}`);
        // Try alternative: scroll the page itself
        await page.evaluate(() => window.scrollBy(0, 300));
      } else if (!scrollResult.scrolled) {
        console.log("Reached scroll bottom - cannot scroll further");
      }

      // Check whether the loading indicator exists and is inside the viewport
      const loadingStatus = await page.evaluate(() => {
        const loader = document.querySelector('svg[aria-label="Loading..."]');
        if (!loader) {
          return { exists: false, visible: false };
        }
        const rect = loader.getBoundingClientRect();
        const isVisible =
          rect.top >= 0 &&
          rect.left >= 0 &&
          rect.bottom <= window.innerHeight &&
          rect.right <= window.innerWidth;
        return { exists: true, visible: isVisible };
      });
      if (!loadingStatus.exists) {
        // No loading indicator at all - might have reached the actual end
        console.log("No loading indicator found - may have reached end of list");
      } else if (loadingStatus.visible) {
        // Loader is visible, meaning we've scrolled to it
        console.log("Loading indicator visible, waiting for more data...");
        await randomSleep(2500, 3500); // Wait longer for Instagram to load more
      } else {
        // Loader exists but not visible yet, keep scrolling
        await randomSleep(1500, 2500);
      }

      scrollAttempts++;
      // Progress update every 50 scrolls
      if (scrollAttempts % 50 === 0) {
        console.log(
          `Progress: ${followingData.length} users captured after ${scrollAttempts} scroll attempts...`
        );
      }
    }

    console.log(`Total users captured: ${followingData.length}`);
    return {
      usernames: followingUsernames.slice(0, maxUsers),
      fullData: followingData.slice(0, maxUsers),
    };
  } finally {
    // Detach the listener so later navigations on this page are not captured.
    page.off("response", captureFollowingResponse);
  }
}
/**
 * Scrapes one profile by capturing Instagram's own API responses.
 *
 * A temporary response listener watches for the REST `web_profile_info`
 * endpoint and for `/graphql/query` responses that carry `data.user`. The
 * first payload belonging to the requested account is mapped to the result.
 * If nothing is captured, scrapeProfileFallback (DOM scraping) is used.
 *
 * @param {object} page - Logged-in Puppeteer page.
 * @param {string} username - Account to scrape.
 * @returns {Promise<object>} Profile record; the API path yields `username`,
 *   `full_name`, `bio`, `followerCount`, `followingCount`, `profile_pic_url`,
 *   `is_verified`, `is_private`, `is_business`, `category`, `external_url`,
 *   `email` and `phone` (the last two parsed from the bio, else null).
 * @throws Propagates navigation errors; the listener is removed either way.
 */
async function scrapeProfile(page, username) {
  console.log(`Scraping profile: @${username}`);
  let profileData = { username };
  let dataCapture = false;
  const requestedName = String(username).toLowerCase();

  // Pulls the first e-mail address and phone-like number out of a bio.
  const extractContact = (bio) => {
    const emailMatch = bio.match(
      /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/
    );
    const phoneMatch = bio.match(/(\+\d{1,3}[- ]?)?\d{10,14}/);
    return {
      email: emailMatch ? emailMatch[0] : null,
      phone: phoneMatch ? phoneMatch[0] : null,
    };
  };

  // Maps a raw API user object to the record shape. The REST and GraphQL
  // payloads name the counters and business fields differently.
  const toProfile = (user, isRestPayload) => {
    const bio = user.biography || "";
    return {
      username: user.username ?? username,
      full_name: user.full_name,
      bio,
      followerCount: isRestPayload
        ? user.edge_followed_by?.count || 0
        : user.follower_count || 0,
      followingCount: isRestPayload
        ? user.edge_follow?.count || 0
        : user.following_count || 0,
      profile_pic_url:
        user.hd_profile_pic_url_info?.url || user.profile_pic_url,
      is_verified: user.is_verified,
      is_private: user.is_private,
      is_business: isRestPayload
        ? user.is_business_account
        : user.is_business_account || user.is_business,
      category: isRestPayload
        ? user.category_name
        : user.category_name || user.category,
      external_url: user.external_url,
      ...extractContact(bio),
    };
  };

  // Set up response listener to intercept API calls
  const responseHandler = async (response) => {
    const url = response.url();
    if (
      !url.includes("/api/v1/users/web_profile_info/") &&
      !url.includes("/graphql/query")
    ) {
      return;
    }
    try {
      const contentType = response.headers()["content-type"] || "";
      if (!contentType.includes("json")) return;
      const json = await response.json();
      const user = json.data?.user;
      // Re-check the flag after the await: another response may have won.
      if (!user || dataCapture) return;
      // Ignore payloads that describe a different account than the one requested.
      if (
        typeof user.username === "string" &&
        user.username.toLowerCase() !== requestedName
      ) {
        return;
      }
      const isRestPayload = url.includes("web_profile_info");
      if (!isRestPayload && !url.includes("graphql")) return;
      profileData = toProfile(user, isRestPayload);
      dataCapture = true;
    } catch (e) {
      // Deliberate best effort: ignore responses whose body cannot be
      // read or parsed as JSON.
    }
  };

  page.on("response", responseHandler);
  try {
    // Navigate to profile page
    await handleRateLimitedRequest(
      page,
      async () => {
        await page.goto(`${INSTAGRAM_URL}/${username}/`, {
          waitUntil: "domcontentloaded",
        });
      },
      `while loading profile @${username}`
    );
    // Wait for API calls to complete
    await randomSleep(2000, 3000);
  } finally {
    // Always remove the listener, including when navigation throws.
    page.off("response", responseHandler);
  }

  // If API capture worked, return the data
  if (dataCapture) {
    return profileData;
  }
  // Otherwise, fall back to DOM scraping
  console.log(`⚠️ API capture failed for @${username}, using DOM fallback...`);
  return await scrapeProfileFallback(page, username);
}
/**
 * Fallback profile scraper that reads the already-loaded profile page's DOM.
 *
 * Extracts the bio (first matching selector with more than 3 characters), the
 * follower / following counts (from the profile links, then from the
 * `og:description` meta tag for any count still missing) and the first e-mail
 * address / phone-like number found in the bio.
 *
 * Counts understand abbreviations: "12.5K" -> 12500, "1.2M" -> 1200000. For a
 * number without a K/M/B suffix both "," and "." are treated as grouping
 * separators ("1,234" and "1.234" -> 1234).
 *
 * @param {object} page - Puppeteer page currently showing the profile.
 * @param {string} username - Account name copied into the result.
 * @returns {Promise<{username: string, bio: string, followerCount: number,
 *   followingCount: number, email: ?string, phone: ?string}>}
 */
async function scrapeProfileFallback(page, username) {
  console.log(`Using DOM scraping for @${username}...`);
  // Everything inside this callback runs in the page, so it cannot reference
  // anything from the surrounding Node scope.
  const domData = await page.evaluate(() => {
    const MULTIPLIERS = { K: 1e3, M: 1e6, B: 1e9 };

    // Converts display text such as "1,234", "12.5K" or "1.2M followers" to an integer.
    const parseCount = (text) => {
      // The suffix only counts when it is not the first letter of a word.
      const match = String(text || "").match(
        /(\d[\d.,]*)([KMB](?![A-Za-z]))?/i
      );
      if (!match) return 0;
      const digits = match[1];
      const suffix = match[2];
      if (suffix) {
        // With a suffix the separator is a decimal mark ("1.2M", or "1,2M"
        // in locales that use a decimal comma).
        const normalized = digits.includes(".")
          ? digits.replace(/,/g, "")
          : digits.replace(",", ".");
        const value = Number.parseFloat(normalized);
        return Number.isNaN(value)
          ? 0
          : Math.round(value * MULTIPLIERS[suffix.toUpperCase()]);
      }
      const value = Number.parseInt(digits.replace(/[.,]/g, ""), 10);
      return Number.isNaN(value) ? 0 : value;
    };

    // Try multiple selectors for bio
    let bio = "";
    const bioSelectors = [
      "span._ap3a._aaco._aacu._aacx._aad7._aade", // Updated bio class (2025)
      "span._ap3a._aaco._aacu._aacx._aad6._aade", // Previous bio class
      "div._aacl._aaco._aacu._aacx._aad7._aade", // Alternative bio with _aad7
      "div._aacl._aaco._aacu._aacx._aad6._aade", // Alternative bio with _aad6
      "h1 + div span", // Bio after username
      "header section div span", // Generic header bio
      'div.x7a106z span[dir="auto"]', // Bio container with dir attribute
    ];
    for (const selector of bioSelectors) {
      const elem = document.querySelector(selector);
      if (elem && elem.innerText && elem.innerText.length > 3) {
        bio = elem.innerText;
        break;
      }
    }

    // Method 1: counts from the profile links, located by href
    const linkText = (selector) => {
      const link = document.querySelector(selector);
      return link ? link.innerText || link.textContent || "" : "";
    };
    let followerCount = parseCount(linkText('a[href*="/followers/"]'));
    let followingCount = parseCount(linkText('a[href*="/following/"]'));

    // Method 2: fill whichever count is still missing from the meta description
    if (!followerCount || !followingCount) {
      const metaContent =
        document.querySelector('meta[property="og:description"]')?.content ||
        "";
      const followerMatch = metaContent.match(/([\d,\.KMB]+)\s+Followers/i);
      const followingMatch = metaContent.match(/([\d,\.KMB]+)\s+Following/i);
      if (!followerCount && followerMatch) {
        followerCount = parseCount(followerMatch[1]);
      }
      if (!followingCount && followingMatch) {
        followingCount = parseCount(followingMatch[1]);
      }
    }

    // Extract email/phone from bio
    const emailMatch = bio.match(
      /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/
    );
    const phoneMatch = bio.match(/(\+\d{1,3}[- ]?)?\d{10,14}/);
    return {
      bio,
      followerCount,
      followingCount,
      email: emailMatch ? emailMatch[0] : null,
      phone: phoneMatch ? phoneMatch[0] : null,
    };
  });
  return {
    username,
    ...domData,
  };
}
/**
 * Runs `fn` repeatedly on a fixed interval.
 *
 * - A tick is skipped while the previous run is still in progress, so runs
 *   never overlap and `stopAfter` cannot be exceeded.
 * - An error thrown or rejected by `fn` is logged and counted as a run; it
 *   does not stop the schedule and does not surface as an unhandled rejection.
 * - The interval is cleared as soon as the job is stopped or the run limit is
 *   reached, so it no longer keeps the process alive.
 *
 * @param {Function} fn - Job to run; may be sync or async.
 * @param {number} intervalSec - Seconds between ticks.
 * @param {number} [stopAfter=0] - Maximum number of runs; 0 means unlimited.
 * @returns {Promise<Function>} Resolves to a function that stops the schedule.
 */
async function cronJobs(fn, intervalSec, stopAfter = 0) {
  let runCount = 0;
  let isRunning = false;
  let stopped = false;
  let timer = null;

  const stop = () => {
    stopped = true;
    clearInterval(timer);
  };

  timer = setInterval(async () => {
    if (stopped || isRunning) {
      return;
    }
    isRunning = true;
    try {
      await fn();
    } catch (err) {
      console.error("Scheduled job failed:", err);
    } finally {
      runCount++;
      isRunning = false;
      if (stopAfter && runCount >= stopAfter) {
        stop();
      }
    }
  }, intervalSec * 1000);

  return stop;
}
/**
 * End-to-end run: log in, read the target's following list, then scrape each
 * followed profile in turn, logging every profile as JSON.
 *
 * The browser is always closed when the run ends. Errors raised after login
 * are logged and swallowed, in which case the promise resolves to `undefined`.
 *
 * @param {{username: string, password: string}} creds - Login credentials.
 * @param {string} targetUsername - Profile whose followings are scraped.
 * @param {?string} [proxy=null] - Proxy passed to loginWithSession.
 * @param {number} [maxFollowingToScrape=10] - Cap on followings to process.
 * @returns {Promise<{session: object, followingsFullData: object[],
 *   scrapedProfiles: number}|undefined>} Summary of the run; `session` holds
 *   the cookies captured right after login so the caller can persist them.
 * @throws Propagates errors from loginWithSession (before a browser handle exists).
 */
async function scrapeWorkflow(
  creds,
  targetUsername,
  proxy = null,
  maxFollowingToScrape = 10
) {
  // loginWithSession is this module's login entry point; there is no `login`.
  const { browser, page } = await loginWithSession(creds, proxy);
  try {
    // Extract current session details for persistence
    const session = await extractSession(page);
    // Grab followings with full data
    const followingsData = await getFollowingsList(
      page,
      targetUsername,
      maxFollowingToScrape
    );
    console.log(
      `Processing ${followingsData.usernames.length} following accounts...`
    );
    for (let i = 0; i < followingsData.usernames.length; i++) {
      // Add occasional longer breaks to simulate human behavior
      if (i > 0 && i % 10 === 0) {
        console.log(`Taking a human-like break after ${i} profiles...`);
        await simulateHumanBehavior(page, { mouseMovements: 5, scrolls: 3 });
        await randomSleep(5000, 10000); // Longer break every 10 profiles
      }
      const profileInfo = await scrapeProfile(
        page,
        followingsData.usernames[i]
      );
      console.log(JSON.stringify(profileInfo));
      // Implement rate limiting + anti-bot sleep
      await randomSleep(2500, 6000);
    }
    // Return the full data for further processing
    return {
      session,
      followingsFullData: followingsData.fullData,
      scrapedProfiles: followingsData.usernames.length,
    };
  } catch (err) {
    // Deliberate: a failed run is reported but does not reject.
    console.error("Scrape error:", err);
  } finally {
    await browser.close();
  }
}
// Public API of this module. performLogin and scrapeProfileFallback are not
// exported; they are only called from within this file.
module.exports = {
loginWithSession,
extractSession,
scrapeWorkflow,
getFollowingsList,
scrapeProfile,
cronJobs,
};