feat: Instagram scraper with GraphQL API integration

- Automated followings list extraction via API interception
- Profile scraping using GraphQL endpoint interception
- DOM fallback for edge cases
- Performance timing for all operations
- Anti-bot measures and human-like behavior simulation
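The followings list is captured by intercepting Instagram's own API/GraphQL responses rather than parsing the rendered DOM (the DOM path is kept only as an edge-case fallback). A minimal sketch of that interception idea, assuming Puppeteer-style page/response APIs — the endpoint filter, response field names, and the captureFollowings helper are illustrative only; the real implementation lives in scraper.js:

async function captureFollowings(page, maxFollowing) {
  const captured = [];
  page.on("response", async (response) => {
    const url = response.url();
    // Match the followings endpoint loosely; the exact path is an assumption.
    if (!url.includes("/friendships/") && !url.includes("graphql")) return;
    try {
      const body = await response.json();
      if (Array.isArray(body.users)) captured.push(...body.users);
    } catch (_) {
      // Ignore non-JSON responses.
    }
  });
  // Open the target's "Following" dialog and scroll until enough users are
  // captured or the list ends (scrolling logic omitted in this sketch).
  return captured.slice(0, maxFollowing);
}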

2025-10-31 23:06:06 +05:45
parent ba2dcec881
commit 6f4f37bee5
8 changed files with 3474 additions and 0 deletions

server.js Normal file

@@ -0,0 +1,356 @@
const {
loginWithSession,
extractSession,
scrapeWorkflow,
getFollowingsList,
scrapeProfile,
cronJobs,
} = require("./scraper.js");
const { randomSleep, simulateHumanBehavior } = require("./utils.js");
const fs = require("fs");
require("dotenv").config();
// Full workflow: Login, browse, scrape followings and profiles
async function fullScrapingWorkflow() {
console.log("Starting Instagram Full Scraping Workflow...\n");
// Start total timer
const totalStartTime = Date.now();
const credentials = {
username: process.env.INSTAGRAM_USERNAME || "your_username",
password: process.env.INSTAGRAM_PASSWORD || "your_password",
};
const targetUsername = process.env.TARGET_USERNAME || "instagram";
const maxFollowing = parseInt(process.env.MAX_FOLLOWING || "20", 10);
const maxProfilesToScrape = parseInt(process.env.MAX_PROFILES || "5", 10);
const proxy = process.env.PROXY || null;
let browser, page;
try {
console.log("Configuration:");
console.log(` Target: @${targetUsername}`);
console.log(` Max following to fetch: ${maxFollowing}`);
console.log(` Max profiles to scrape: ${maxProfilesToScrape}`);
console.log(` Proxy: ${proxy || "None"}\n`);
// Step 1: Login (with session reuse)
console.log("Step 1: Logging in to Instagram...");
const loginResult = await loginWithSession(credentials, proxy, true);
browser = loginResult.browser;
page = loginResult.page;
if (loginResult.sessionReused) {
console.log("Reused existing session!\n");
} else {
console.log("Fresh login successful!\n");
}
// Step 2: Extract and save session
console.log("Step 2: Extracting session cookies...");
const session = await extractSession(page);
fs.writeFileSync("session_cookies.json", JSON.stringify(session, null, 2));
console.log(`Session saved (${session.cookies.length} cookies)\n`);
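    // extractSession is assumed to return an object of the form { cookies: [...] }
    // (session.cookies.length is read above); saving it is presumably what lets
    // loginWithSession report sessionReused on later runs instead of logging in again.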
// Step 3: Simulate browsing before scraping
console.log("Step 3: Simulating human browsing behavior...");
await simulateHumanBehavior(page, { mouseMovements: 5, scrolls: 3 });
await randomSleep(2000, 4000);
console.log("Browsing simulation complete\n");
// Step 4: Get followings list
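    // getFollowingsList is expected to resolve to { usernames: string[], fullData: object[] }
    // (inferred from how followingsData is used below; the exact shape comes from scraper.js).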
console.log(`👥 Step 4: Fetching following list for @${targetUsername}...`);
const followingsStartTime = Date.now();
const followingsData = await getFollowingsList(
page,
targetUsername,
maxFollowing
);
const followingsEndTime = Date.now();
const followingsTime = (
(followingsEndTime - followingsStartTime) /
1000
).toFixed(2);
console.log(
`✓ Captured ${followingsData.fullData.length} followings in ${followingsTime}s\n`
);
// Save followings data
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const followingsFile = `followings_${targetUsername}_${timestamp}.json`;
fs.writeFileSync(
followingsFile,
JSON.stringify(
{
targetUsername,
scrapedAt: new Date().toISOString(),
totalFollowings: followingsData.fullData.length,
followings: followingsData.fullData,
},
null,
2
)
);
console.log(`Followings data saved to: ${followingsFile}\n`);
// Step 5: Scrape individual profiles
console.log(
`📊 Step 5: Scraping ${maxProfilesToScrape} individual profiles...`
);
const profilesStartTime = Date.now();
const profilesData = [];
const usernamesToScrape = followingsData.usernames.slice(
0,
maxProfilesToScrape
);
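    // scrapeProfile is expected to resolve to an object that includes at least
    // followerCount (the only field logged below); the full shape is defined in scraper.js.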
for (let i = 0; i < usernamesToScrape.length; i++) {
const username = usernamesToScrape[i];
console.log(
` [${i + 1}/${usernamesToScrape.length}] Scraping @${username}...`
);
try {
const profileData = await scrapeProfile(page, username);
profilesData.push(profileData);
console.log(` @${username}: ${profileData.followerCount} followers`);
// Human-like delay between profiles
await randomSleep(3000, 6000);
// Take a longer break every 3 profiles
if ((i + 1) % 3 === 0 && i < usernamesToScrape.length - 1) {
console.log(" ⏸ Taking a human-like break...");
await simulateHumanBehavior(page, { mouseMovements: 4, scrolls: 2 });
await randomSleep(8000, 12000);
}
} catch (error) {
console.log(` Failed to scrape @${username}: ${error.message}`);
}
}
const profilesEndTime = Date.now();
const profilesTime = ((profilesEndTime - profilesStartTime) / 1000).toFixed(
2
);
console.log(
`\n✓ Scraped ${profilesData.length} profiles in ${profilesTime}s\n`
);
// Step 6: Save profiles data
console.log("Step 6: Saving profile data...");
const profilesFile = `profiles_${targetUsername}_${timestamp}.json`;
fs.writeFileSync(
profilesFile,
JSON.stringify(
{
targetUsername,
scrapedAt: new Date().toISOString(),
totalProfiles: profilesData.length,
profiles: profilesData,
},
null,
2
)
);
console.log(`Profiles data saved to: ${profilesFile}\n`);
// Calculate total time
const totalEndTime = Date.now();
    const totalTimeSeconds = (totalEndTime - totalStartTime) / 1000;
    const totalTime = totalTimeSeconds.toFixed(2);
    const totalMinutes = Math.floor(totalTimeSeconds / 60);
    const totalSeconds = (totalTimeSeconds % 60).toFixed(2);
// Step 7: Summary
console.log("=".repeat(60));
console.log("📊 SCRAPING SUMMARY");
console.log("=".repeat(60));
console.log(`✓ Logged in successfully`);
console.log(`✓ Session cookies saved`);
    console.log(
      `✓ ${followingsData.fullData.length} followings captured in ${followingsTime}s`
    );
    console.log(
      `✓ ${profilesData.length} profiles scraped in ${profilesTime}s`
    );
    console.log(`\n📁 Files created:`);
    console.log(`  • ${followingsFile}`);
    console.log(`  • ${profilesFile}`);
    console.log(`  • session_cookies.json`);
console.log(
`\n⏱️ Total execution time: ${totalMinutes}m ${totalSeconds}s`
);
console.log("=".repeat(60) + "\n");
return {
success: true,
followingsCount: followingsData.fullData.length,
profilesCount: profilesData.length,
followingsData: followingsData.fullData,
profilesData,
session,
timings: {
followingsTime: parseFloat(followingsTime),
profilesTime: parseFloat(profilesTime),
totalTime: parseFloat(totalTime),
},
};
} catch (error) {
console.error("\nScraping workflow failed:");
console.error(error.message);
console.error(error.stack);
throw error;
} finally {
if (browser) {
console.log("Closing browser...");
await browser.close();
console.log("Browser closed\n");
}
}
}
// Alternative: Use the built-in scrapeWorkflow function
async function simpleWorkflow() {
console.log("Starting Simple Scraping Workflow (using scrapeWorkflow)...\n");
const credentials = {
username: process.env.INSTAGRAM_USERNAME || "your_username",
password: process.env.INSTAGRAM_PASSWORD || "your_password",
};
const targetUsername = process.env.TARGET_USERNAME || "instagram";
const maxFollowing = parseInt(process.env.MAX_FOLLOWING || "20", 10);
const proxy = process.env.PROXY || null;
try {
console.log(`Target: @${targetUsername}`);
console.log(`Max following to scrape: ${maxFollowing}`);
console.log(`Using proxy: ${proxy || "None"}\n`);
const result = await scrapeWorkflow(
credentials,
targetUsername,
proxy,
maxFollowing
);
console.log("\nScraping completed successfully!");
console.log(`Total profiles scraped: ${result.scrapedProfiles}`);
console.log(
`Full following data captured: ${result.followingsFullData.length} users`
);
// Save the data
if (result.followingsFullData.length > 0) {
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const filename = `scraped_data_${targetUsername}_${timestamp}.json`;
fs.writeFileSync(
filename,
JSON.stringify(
{
targetUsername,
scrapedAt: new Date().toISOString(),
totalUsers: result.followingsFullData.length,
data: result.followingsFullData,
},
null,
2
)
);
console.log(`Data saved to: ${filename}`);
}
return result;
} catch (error) {
console.error("\nScraping failed:");
console.error(error.message);
throw error;
}
}
// Scheduled scraping with cron
async function scheduledScraping() {
console.log("Starting Scheduled Scraping...\n");
const credentials = {
username: process.env.INSTAGRAM_USERNAME || "your_username",
password: process.env.INSTAGRAM_PASSWORD || "your_password",
};
const targetUsername = process.env.TARGET_USERNAME || "instagram";
const intervalMinutes = parseInt(process.env.SCRAPE_INTERVAL || "60", 10);
const maxRuns = parseInt(process.env.MAX_RUNS || "5", 10);
console.log(
`Will scrape @${targetUsername} every ${intervalMinutes} minutes`
);
console.log(`Maximum runs: ${maxRuns}\n`);
let runCount = 0;
const stopCron = await cronJobs(
async () => {
runCount++;
console.log(`\n${"=".repeat(60)}`);
console.log(
`📅 Scheduled Run #${runCount} - ${new Date().toLocaleString()}`
);
console.log("=".repeat(60));
try {
await simpleWorkflow();
} catch (error) {
console.error(`Run #${runCount} failed:`, error.message);
}
if (runCount >= maxRuns) {
console.log(`\nCompleted ${maxRuns} scheduled runs. Stopping...`);
process.exit(0);
}
},
intervalMinutes * 60, // Convert to seconds
maxRuns
);
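  // cronJobs (from scraper.js) is assumed to take (callback, intervalSeconds, maxRuns)
  // and to return a stop function; stopCron is left unused here because the callback
  // exits the process itself once maxRuns is reached.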
console.log("Cron job started. Press Ctrl+C to stop.\n");
}
// Main entry point
if (require.main === module) {
const mode = process.env.MODE || "full"; // full, simple, or scheduled
console.log(`Mode: ${mode}\n`);
let workflow;
if (mode === "simple") {
workflow = simpleWorkflow();
} else if (mode === "scheduled") {
workflow = scheduledScraping();
} else {
workflow = fullScrapingWorkflow();
}
  workflow
    .then(() => {
      if (mode === "scheduled") {
        // Keep the process alive so the scheduled runs can fire;
        // scheduledScraping() exits the process itself after maxRuns.
        return;
      }
      console.log("All done!");
      process.exit(0);
    })
.catch((err) => {
console.error("\nFatal error:", err);
process.exit(1);
});
}
module.exports = {
fullScrapingWorkflow,
simpleWorkflow,
scheduledScraping,
};
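// Typical invocations (assuming the example .env above):
//   MODE=full node server.js        -> fullScrapingWorkflow(): login, followings, profiles
//   MODE=simple node server.js      -> simpleWorkflow(): delegates to scrapeWorkflow()
//   MODE=scheduled node server.js   -> scheduledScraping(): repeats simpleWorkflow() on an interval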