feat: Instagram scraper with GraphQL API integration

- Automated followings list extraction via API interception
- Profile scraping using GraphQL endpoint interception
- DOM fallback for edge cases
- Performance timing for all operations
- Anti-bot measures and human-like behavior simulation
server.js | 356 | Normal file
@@ -0,0 +1,356 @@
const {
  loginWithSession,
  extractSession,
  scrapeWorkflow,
  getFollowingsList,
  scrapeProfile,
  cronJobs,
} = require("./scraper.js");
const { randomSleep, simulateHumanBehavior } = require("./utils.js");
const fs = require("fs");
require("dotenv").config();

// Full workflow: Login, browse, scrape followings and profiles
async function fullScrapingWorkflow() {
  console.log("Starting Instagram Full Scraping Workflow...\n");

  // Start total timer
  const totalStartTime = Date.now();

  const credentials = {
    username: process.env.INSTAGRAM_USERNAME || "your_username",
    password: process.env.INSTAGRAM_PASSWORD || "your_password",
  };

  const targetUsername = process.env.TARGET_USERNAME || "instagram";
  const maxFollowing = parseInt(process.env.MAX_FOLLOWING || "20", 10);
  const maxProfilesToScrape = parseInt(process.env.MAX_PROFILES || "5", 10);
  const proxy = process.env.PROXY || null;

  let browser, page;

  try {
    console.log("Configuration:");
    console.log(` Target: @${targetUsername}`);
    console.log(` Max following to fetch: ${maxFollowing}`);
    console.log(` Max profiles to scrape: ${maxProfilesToScrape}`);
    console.log(` Proxy: ${proxy || "None"}\n`);

    // Step 1: Login (with session reuse)
    console.log("Step 1: Logging in to Instagram...");
    const loginResult = await loginWithSession(credentials, proxy, true);
    browser = loginResult.browser;
    page = loginResult.page;

    if (loginResult.sessionReused) {
      console.log("Reused existing session!\n");
    } else {
      console.log("Fresh login successful!\n");
    }

    // Step 2: Extract and save session
    console.log("Step 2: Extracting session cookies...");
    const session = await extractSession(page);
    fs.writeFileSync("session_cookies.json", JSON.stringify(session, null, 2));
    console.log(`Session saved (${session.cookies.length} cookies)\n`);

    // Step 3: Simulate browsing before scraping
    console.log("Step 3: Simulating human browsing behavior...");
    await simulateHumanBehavior(page, { mouseMovements: 5, scrolls: 3 });
    await randomSleep(2000, 4000);
    console.log("Browsing simulation complete\n");

    // Step 4: Get followings list
    console.log(`👥 Step 4: Fetching following list for @${targetUsername}...`);
    const followingsStartTime = Date.now();

    const followingsData = await getFollowingsList(
      page,
      targetUsername,
      maxFollowing
    );

    const followingsEndTime = Date.now();
    const followingsTime = (
      (followingsEndTime - followingsStartTime) /
      1000
    ).toFixed(2);

    console.log(
      `✓ Captured ${followingsData.fullData.length} followings in ${followingsTime}s\n`
    );

    // Save followings data
    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    const followingsFile = `followings_${targetUsername}_${timestamp}.json`;
    fs.writeFileSync(
      followingsFile,
      JSON.stringify(
        {
          targetUsername,
          scrapedAt: new Date().toISOString(),
          totalFollowings: followingsData.fullData.length,
          followings: followingsData.fullData,
        },
        null,
        2
      )
    );
    console.log(`Followings data saved to: ${followingsFile}\n`);

    // Step 5: Scrape individual profiles
    console.log(
      `📊 Step 5: Scraping ${maxProfilesToScrape} individual profiles...`
    );
    const profilesStartTime = Date.now();
    const profilesData = [];
    const usernamesToScrape = followingsData.usernames.slice(
      0,
      maxProfilesToScrape
    );

    for (let i = 0; i < usernamesToScrape.length; i++) {
      const username = usernamesToScrape[i];
      console.log(
        ` [${i + 1}/${usernamesToScrape.length}] Scraping @${username}...`
      );

      try {
        const profileData = await scrapeProfile(page, username);
        profilesData.push(profileData);
        console.log(` @${username}: ${profileData.followerCount} followers`);

        // Human-like delay between profiles
        await randomSleep(3000, 6000);

        // Take a longer break every 3 profiles
        if ((i + 1) % 3 === 0 && i < usernamesToScrape.length - 1) {
          console.log(" ⏸ Taking a human-like break...");
          await simulateHumanBehavior(page, { mouseMovements: 4, scrolls: 2 });
          await randomSleep(8000, 12000);
        }
      } catch (error) {
        console.log(` Failed to scrape @${username}: ${error.message}`);
      }
    }

    const profilesEndTime = Date.now();
    const profilesTime = ((profilesEndTime - profilesStartTime) / 1000).toFixed(
      2
    );

    console.log(
      `\n✓ Scraped ${profilesData.length} profiles in ${profilesTime}s\n`
    );

    // Step 6: Save profiles data
    console.log("Step 6: Saving profile data...");
    const profilesFile = `profiles_${targetUsername}_${timestamp}.json`;
    fs.writeFileSync(
      profilesFile,
      JSON.stringify(
        {
          targetUsername,
          scrapedAt: new Date().toISOString(),
          totalProfiles: profilesData.length,
          profiles: profilesData,
        },
        null,
        2
      )
    );
    console.log(`Profiles data saved to: ${profilesFile}\n`);

    // Calculate total time
    const totalEndTime = Date.now();
    const totalTime = ((totalEndTime - totalStartTime) / 1000).toFixed(2);
    const totalMinutes = Math.floor(totalTime / 60);
    const totalSeconds = (totalTime % 60).toFixed(2);

    // Step 7: Summary
    console.log("=".repeat(60));
    console.log("📊 SCRAPING SUMMARY");
    console.log("=".repeat(60));
    console.log(`✓ Logged in successfully`);
    console.log(`✓ Session cookies saved`);
    console.log(
      `✓ ${followingsData.fullData.length} followings captured in ${followingsTime}s`
    );
    console.log(
      `✓ ${profilesData.length} profiles scraped in ${profilesTime}s`
    );
    console.log(`\n📁 Files created:`);
    console.log(` • ${followingsFile}`);
    console.log(` • ${profilesFile}`);
    console.log(` • session_cookies.json`);
    console.log(
      `\n⏱️ Total execution time: ${totalMinutes}m ${totalSeconds}s`
    );
    console.log("=".repeat(60) + "\n");

    return {
      success: true,
      followingsCount: followingsData.fullData.length,
      profilesCount: profilesData.length,
      followingsData: followingsData.fullData,
      profilesData,
      session,
      timings: {
        followingsTime: parseFloat(followingsTime),
        profilesTime: parseFloat(profilesTime),
        totalTime: parseFloat(totalTime),
      },
    };
  } catch (error) {
    console.error("\nScraping workflow failed:");
    console.error(error.message);
    console.error(error.stack);
    throw error;
  } finally {
    if (browser) {
      console.log("Closing browser...");
      await browser.close();
      console.log("Browser closed\n");
    }
  }
}

// Alternative: Use the built-in scrapeWorkflow function
async function simpleWorkflow() {
  console.log("Starting Simple Scraping Workflow (using scrapeWorkflow)...\n");

  const credentials = {
    username: process.env.INSTAGRAM_USERNAME || "your_username",
    password: process.env.INSTAGRAM_PASSWORD || "your_password",
  };

  const targetUsername = process.env.TARGET_USERNAME || "instagram";
  const maxFollowing = parseInt(process.env.MAX_FOLLOWING || "20", 10);
  const proxy = process.env.PROXY || null;

  try {
    console.log(`Target: @${targetUsername}`);
    console.log(`Max following to scrape: ${maxFollowing}`);
    console.log(`Using proxy: ${proxy || "None"}\n`);

    const result = await scrapeWorkflow(
      credentials,
      targetUsername,
      proxy,
      maxFollowing
    );

    console.log("\nScraping completed successfully!");
    console.log(`Total profiles scraped: ${result.scrapedProfiles}`);
    console.log(
      `Full following data captured: ${result.followingsFullData.length} users`
    );

    // Save the data
    if (result.followingsFullData.length > 0) {
      const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
      const filename = `scraped_data_${targetUsername}_${timestamp}.json`;

      fs.writeFileSync(
        filename,
        JSON.stringify(
          {
            targetUsername,
            scrapedAt: new Date().toISOString(),
            totalUsers: result.followingsFullData.length,
            data: result.followingsFullData,
          },
          null,
          2
        )
      );

      console.log(`Data saved to: ${filename}`);
    }

    return result;
  } catch (error) {
    console.error("\nScraping failed:");
    console.error(error.message);
    throw error;
  }
}

// Scheduled scraping with cron
async function scheduledScraping() {
  console.log("Starting Scheduled Scraping...\n");

  const credentials = {
    username: process.env.INSTAGRAM_USERNAME || "your_username",
    password: process.env.INSTAGRAM_PASSWORD || "your_password",
  };

  const targetUsername = process.env.TARGET_USERNAME || "instagram";
  const intervalMinutes = parseInt(process.env.SCRAPE_INTERVAL || "60", 10);
  const maxRuns = parseInt(process.env.MAX_RUNS || "5", 10);

  console.log(
    `Will scrape @${targetUsername} every ${intervalMinutes} minutes`
  );
  console.log(`Maximum runs: ${maxRuns}\n`);

  let runCount = 0;

  const stopCron = await cronJobs(
    async () => {
      runCount++;
      console.log(`\n${"=".repeat(60)}`);
      console.log(
        `📅 Scheduled Run #${runCount} - ${new Date().toLocaleString()}`
      );
      console.log("=".repeat(60));

      try {
        await simpleWorkflow();
      } catch (error) {
        console.error(`Run #${runCount} failed:`, error.message);
      }

      if (runCount >= maxRuns) {
        console.log(`\nCompleted ${maxRuns} scheduled runs. Stopping...`);
        process.exit(0);
      }
    },
    intervalMinutes * 60, // Convert to seconds
    maxRuns
  );

  console.log("Cron job started. Press Ctrl+C to stop.\n");
}

// Main entry point
if (require.main === module) {
  const mode = process.env.MODE || "full"; // full, simple, or scheduled

  console.log(`Mode: ${mode}\n`);

  let workflow;
  if (mode === "simple") {
    workflow = simpleWorkflow();
  } else if (mode === "scheduled") {
    workflow = scheduledScraping();
  } else {
    workflow = fullScrapingWorkflow();
  }

  workflow
    .then(() => {
      console.log("All done!");
      process.exit(0);
    })
    .catch((err) => {
      console.error("\nFatal error:", err);
      process.exit(1);
    });
}

module.exports = {
  fullScrapingWorkflow,
  simpleWorkflow,
  scheduledScraping,
};
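A minimal usage sketch of the exported workflows from a separate caller script. This is illustrative only: it assumes the file above is saved as server.js next to the caller, that the env vars it reads (INSTAGRAM_USERNAME, INSTAGRAM_PASSWORD, TARGET_USERNAME, etc.) are already set, and that the result shape matches what simpleWorkflow itself logs (result.scrapedProfiles).

// run-simple.js (hypothetical caller, not part of this commit)
const { simpleWorkflow } = require("./server.js");

simpleWorkflow()
  .then((result) => {
    // scrapedProfiles is the field simpleWorkflow logs from scrapeWorkflow's result
    console.log(`Done: ${result.scrapedProfiles} profiles scraped`);
  })
  .catch((err) => {
    console.error("Workflow failed:", err.message);
    process.exit(1);
  });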