feat: Instagram scraper with GraphQL API integration

- Automated followings list extraction via API interception
- Profile scraping using GraphQL endpoint interception
- DOM fallback for edge cases
- Performance timing for all operations
- Anti-bot measures and human-like behavior simulation
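The followings list is captured by intercepting Instagram's own API/GraphQL responses rather than parsing the rendered DOM (the DOM path is kept only as an edge-case fallback). A minimal sketch of that interception idea, assuming Puppeteer-style page/response APIs — the endpoint filter, response field names, and the captureFollowings helper are illustrative only; the real implementation lives in scraper.js:

async function captureFollowings(page, maxFollowing) {
  const captured = [];
  page.on("response", async (response) => {
    const url = response.url();
    // Match the followings endpoint loosely; the exact path is an assumption.
    if (!url.includes("/friendships/") && !url.includes("graphql")) return;
    try {
      const body = await response.json();
      if (Array.isArray(body.users)) captured.push(...body.users);
    } catch (_) {
      // Ignore non-JSON responses.
    }
  });
  // Open the target's "Following" dialog and scroll until enough users are
  // captured or the list ends (scrolling logic omitted in this sketch).
  return captured.slice(0, maxFollowing);
}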

2025-10-31 23:06:06 +05:45
parent ba2dcec881
commit 6f4f37bee5
8 changed files with 3474 additions and 0 deletions

server.js Normal file

@@ -0,0 +1,356 @@
const {
loginWithSession,
extractSession,
scrapeWorkflow,
getFollowingsList,
scrapeProfile,
cronJobs,
} = require("./scraper.js");
const { randomSleep, simulateHumanBehavior } = require("./utils.js");
const fs = require("fs");
require("dotenv").config();
// Full workflow: Login, browse, scrape followings and profiles
async function fullScrapingWorkflow() {
console.log("Starting Instagram Full Scraping Workflow...\n");
// Start total timer
const totalStartTime = Date.now();
const credentials = {
username: process.env.INSTAGRAM_USERNAME || "your_username",
password: process.env.INSTAGRAM_PASSWORD || "your_password",
};
const targetUsername = process.env.TARGET_USERNAME || "instagram";
const maxFollowing = parseInt(process.env.MAX_FOLLOWING || "20", 10);
const maxProfilesToScrape = parseInt(process.env.MAX_PROFILES || "5", 10);
const proxy = process.env.PROXY || null;
let browser, page;
try {
console.log("Configuration:");
console.log(` Target: @${targetUsername}`);
console.log(` Max following to fetch: ${maxFollowing}`);
console.log(` Max profiles to scrape: ${maxProfilesToScrape}`);
console.log(` Proxy: ${proxy || "None"}\n`);
// Step 1: Login (with session reuse)
console.log("Step 1: Logging in to Instagram...");
const loginResult = await loginWithSession(credentials, proxy, true);
browser = loginResult.browser;
page = loginResult.page;
if (loginResult.sessionReused) {
console.log("Reused existing session!\n");
} else {
console.log("Fresh login successful!\n");
}
// Step 2: Extract and save session
console.log("Step 2: Extracting session cookies...");
const session = await extractSession(page);
fs.writeFileSync("session_cookies.json", JSON.stringify(session, null, 2));
console.log(`Session saved (${session.cookies.length} cookies)\n`);
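    // extractSession is assumed to return an object of the form { cookies: [...] }
    // (session.cookies.length is read above); saving it is presumably what lets
    // loginWithSession report sessionReused on later runs instead of logging in again.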
// Step 3: Simulate browsing before scraping
console.log("Step 3: Simulating human browsing behavior...");
await simulateHumanBehavior(page, { mouseMovements: 5, scrolls: 3 });
await randomSleep(2000, 4000);
console.log("Browsing simulation complete\n");
// Step 4: Get followings list
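    // getFollowingsList is expected to resolve to { usernames: string[], fullData: object[] }
    // (inferred from how followingsData is used below; the exact shape comes from scraper.js).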
console.log(`👥 Step 4: Fetching following list for @${targetUsername}...`);
const followingsStartTime = Date.now();
const followingsData = await getFollowingsList(
page,
targetUsername,
maxFollowing
);
const followingsEndTime = Date.now();
const followingsTime = (
(followingsEndTime - followingsStartTime) /
1000
).toFixed(2);
console.log(
`✓ Captured ${followingsData.fullData.length} followings in ${followingsTime}s\n`
);
// Save followings data
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const followingsFile = `followings_${targetUsername}_${timestamp}.json`;
fs.writeFileSync(
followingsFile,
JSON.stringify(
{
targetUsername,
scrapedAt: new Date().toISOString(),
totalFollowings: followingsData.fullData.length,
followings: followingsData.fullData,
},
null,
2
)
);
console.log(`Followings data saved to: ${followingsFile}\n`);
// Step 5: Scrape individual profiles
console.log(
`📊 Step 5: Scraping ${maxProfilesToScrape} individual profiles...`
);
const profilesStartTime = Date.now();
const profilesData = [];
const usernamesToScrape = followingsData.usernames.slice(
0,
maxProfilesToScrape
);
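    // scrapeProfile is expected to resolve to an object that includes at least
    // followerCount (the only field logged below); the full shape is defined in scraper.js.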
for (let i = 0; i < usernamesToScrape.length; i++) {
const username = usernamesToScrape[i];
console.log(
` [${i + 1}/${usernamesToScrape.length}] Scraping @${username}...`
);
try {
const profileData = await scrapeProfile(page, username);
profilesData.push(profileData);
console.log(` @${username}: ${profileData.followerCount} followers`);
// Human-like delay between profiles
await randomSleep(3000, 6000);
// Take a longer break every 3 profiles
if ((i + 1) % 3 === 0 && i < usernamesToScrape.length - 1) {
console.log(" ⏸ Taking a human-like break...");
await simulateHumanBehavior(page, { mouseMovements: 4, scrolls: 2 });
await randomSleep(8000, 12000);
}
} catch (error) {
console.log(` Failed to scrape @${username}: ${error.message}`);
}
}
const profilesEndTime = Date.now();
const profilesTime = ((profilesEndTime - profilesStartTime) / 1000).toFixed(
2
);
console.log(
`\n✓ Scraped ${profilesData.length} profiles in ${profilesTime}s\n`
);
// Step 6: Save profiles data
console.log("Step 6: Saving profile data...");
const profilesFile = `profiles_${targetUsername}_${timestamp}.json`;
fs.writeFileSync(
profilesFile,
JSON.stringify(
{
targetUsername,
scrapedAt: new Date().toISOString(),
totalProfiles: profilesData.length,
profiles: profilesData,
},
null,
2
)
);
console.log(`Profiles data saved to: ${profilesFile}\n`);
// Calculate total time
const totalEndTime = Date.now();
    const totalTimeSeconds = (totalEndTime - totalStartTime) / 1000;
    const totalTime = totalTimeSeconds.toFixed(2);
    const totalMinutes = Math.floor(totalTimeSeconds / 60);
    const totalSeconds = (totalTimeSeconds % 60).toFixed(2);
// Step 7: Summary
console.log("=".repeat(60));
console.log("📊 SCRAPING SUMMARY");
console.log("=".repeat(60));
console.log(`✓ Logged in successfully`);
console.log(`✓ Session cookies saved`);
    console.log(
      `✓ ${followingsData.fullData.length} followings captured in ${followingsTime}s`
    );
    console.log(
      `✓ ${profilesData.length} profiles scraped in ${profilesTime}s`
    );
    console.log(`\n📁 Files created:`);
    console.log(`  • ${followingsFile}`);
    console.log(`  • ${profilesFile}`);
    console.log(`  • session_cookies.json`);
console.log(
`\n⏱️ Total execution time: ${totalMinutes}m ${totalSeconds}s`
);
console.log("=".repeat(60) + "\n");
return {
success: true,
followingsCount: followingsData.fullData.length,
profilesCount: profilesData.length,
followingsData: followingsData.fullData,
profilesData,
session,
timings: {
followingsTime: parseFloat(followingsTime),
profilesTime: parseFloat(profilesTime),
totalTime: parseFloat(totalTime),
},
};
} catch (error) {
console.error("\nScraping workflow failed:");
console.error(error.message);
console.error(error.stack);
throw error;
} finally {
if (browser) {
console.log("Closing browser...");
await browser.close();
console.log("Browser closed\n");
}
}
}
// Alternative: Use the built-in scrapeWorkflow function
async function simpleWorkflow() {
console.log("Starting Simple Scraping Workflow (using scrapeWorkflow)...\n");
const credentials = {
username: process.env.INSTAGRAM_USERNAME || "your_username",
password: process.env.INSTAGRAM_PASSWORD || "your_password",
};
const targetUsername = process.env.TARGET_USERNAME || "instagram";
const maxFollowing = parseInt(process.env.MAX_FOLLOWING || "20", 10);
const proxy = process.env.PROXY || null;
try {
console.log(`Target: @${targetUsername}`);
console.log(`Max following to scrape: ${maxFollowing}`);
console.log(`Using proxy: ${proxy || "None"}\n`);
const result = await scrapeWorkflow(
credentials,
targetUsername,
proxy,
maxFollowing
);
console.log("\nScraping completed successfully!");
console.log(`Total profiles scraped: ${result.scrapedProfiles}`);
console.log(
`Full following data captured: ${result.followingsFullData.length} users`
);
// Save the data
if (result.followingsFullData.length > 0) {
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const filename = `scraped_data_${targetUsername}_${timestamp}.json`;
fs.writeFileSync(
filename,
JSON.stringify(
{
targetUsername,
scrapedAt: new Date().toISOString(),
totalUsers: result.followingsFullData.length,
data: result.followingsFullData,
},
null,
2
)
);
console.log(`Data saved to: ${filename}`);
}
return result;
} catch (error) {
console.error("\nScraping failed:");
console.error(error.message);
throw error;
}
}
// Scheduled scraping with cron
async function scheduledScraping() {
console.log("Starting Scheduled Scraping...\n");
const credentials = {
username: process.env.INSTAGRAM_USERNAME || "your_username",
password: process.env.INSTAGRAM_PASSWORD || "your_password",
};
const targetUsername = process.env.TARGET_USERNAME || "instagram";
const intervalMinutes = parseInt(process.env.SCRAPE_INTERVAL || "60", 10);
const maxRuns = parseInt(process.env.MAX_RUNS || "5", 10);
console.log(
`Will scrape @${targetUsername} every ${intervalMinutes} minutes`
);
console.log(`Maximum runs: ${maxRuns}\n`);
let runCount = 0;
const stopCron = await cronJobs(
async () => {
runCount++;
console.log(`\n${"=".repeat(60)}`);
console.log(
`📅 Scheduled Run #${runCount} - ${new Date().toLocaleString()}`
);
console.log("=".repeat(60));
try {
await simpleWorkflow();
} catch (error) {
console.error(`Run #${runCount} failed:`, error.message);
}
if (runCount >= maxRuns) {
console.log(`\nCompleted ${maxRuns} scheduled runs. Stopping...`);
process.exit(0);
}
},
intervalMinutes * 60, // Convert to seconds
maxRuns
);
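  // cronJobs (from scraper.js) is assumed to take (callback, intervalSeconds, maxRuns)
  // and to return a stop function; stopCron is left unused here because the callback
  // exits the process itself once maxRuns is reached.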
console.log("Cron job started. Press Ctrl+C to stop.\n");
}
// Main entry point
if (require.main === module) {
const mode = process.env.MODE || "full"; // full, simple, or scheduled
console.log(`Mode: ${mode}\n`);
let workflow;
if (mode === "simple") {
workflow = simpleWorkflow();
} else if (mode === "scheduled") {
workflow = scheduledScraping();
} else {
workflow = fullScrapingWorkflow();
}
  workflow
    .then(() => {
      if (mode === "scheduled") {
        // Keep the process alive so the scheduled runs can fire;
        // scheduledScraping() exits the process itself after maxRuns.
        return;
      }
      console.log("All done!");
      process.exit(0);
    })
.catch((err) => {
console.error("\nFatal error:", err);
process.exit(1);
});
}
module.exports = {
fullScrapingWorkflow,
simpleWorkflow,
scheduledScraping,
};
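// Typical invocations (assuming the example .env above):
//   MODE=full node server.js        -> fullScrapingWorkflow(): login, followings, profiles
//   MODE=simple node server.js      -> simpleWorkflow(): delegates to scrapeWorkflow()
//   MODE=scheduled node server.js   -> scheduledScraping(): repeats simpleWorkflow() on an interval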