const {
  loginWithSession,
  extractSession,
  scrapeWorkflow,
  getFollowingsList,
  scrapeProfile,
  cronJobs,
} = require("./scraper.js");
const { randomSleep, simulateHumanBehavior } = require("./utils.js");
const fs = require("fs");
require("dotenv").config();

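// All configuration below is read from the environment via dotenv. A minimal
// .env sketch (illustrative values only; every entry is optional and falls
// back to the defaults used in the code; the PROXY format in particular is an
// assumption, since the value is passed straight through to scraper.js):
//
//   INSTAGRAM_USERNAME=your_username
//   INSTAGRAM_PASSWORD=your_password
//   TARGET_USERNAME=instagram
//   MAX_FOLLOWING=20
//   MAX_PROFILES=5
//   PROXY=http://user:pass@host:port
//   MODE=full
//   SCRAPE_INTERVAL=60
//   MAX_RUNS=5
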
// Full workflow: Login, browse, scrape followings and profiles
async function fullScrapingWorkflow() {
  console.log("Starting Instagram Full Scraping Workflow...\n");

  // Start total timer
  const totalStartTime = Date.now();

  const credentials = {
    username: process.env.INSTAGRAM_USERNAME || "your_username",
    password: process.env.INSTAGRAM_PASSWORD || "your_password",
  };

  const targetUsername = process.env.TARGET_USERNAME || "instagram";
  const maxFollowing = parseInt(process.env.MAX_FOLLOWING || "20", 10);
  const maxProfilesToScrape = parseInt(process.env.MAX_PROFILES || "5", 10);
  const proxy = process.env.PROXY || null;

  let browser, page;

  try {
    console.log("Configuration:");
    console.log(` Target: @${targetUsername}`);
    console.log(` Max following to fetch: ${maxFollowing}`);
    console.log(` Max profiles to scrape: ${maxProfilesToScrape}`);
    console.log(` Proxy: ${proxy || "None"}\n`);

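    // Note: the third argument to loginWithSession below is assumed to be a
    // "reuse saved session" flag -- the result exposes sessionReused, so a
    // truthy value here presumably lets the login skip the password flow when
    // valid cookies are already on disk.
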
    // Step 1: Login (with session reuse)
    console.log("Step 1: Logging in to Instagram...");
    const loginResult = await loginWithSession(credentials, proxy, true);
    browser = loginResult.browser;
    page = loginResult.page;

    if (loginResult.sessionReused) {
      console.log("Reused existing session!\n");
    } else {
      console.log("Fresh login successful!\n");
    }

    // Step 2: Extract and save session
    console.log("Step 2: Extracting session cookies...");
    const session = await extractSession(page);
    fs.writeFileSync("session_cookies.json", JSON.stringify(session, null, 2));
    console.log(`Session saved (${session.cookies.length} cookies)\n`);

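    // The session_cookies.json written above is the artifact a later run can
    // presumably feed back into loginWithSession (via its session-reuse flag)
    // to avoid logging in from scratch; the exact reload mechanism lives in
    // scraper.js and is not shown here.
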
    // Step 3: Simulate browsing before scraping
    console.log("Step 3: Simulating human browsing behavior...");
    await simulateHumanBehavior(page, { mouseMovements: 5, scrolls: 3 });
    await randomSleep(2000, 4000);
    console.log("Browsing simulation complete\n");

    // Step 4: Get followings list
    console.log(`👥 Step 4: Fetching following list for @${targetUsername}...`);
    const followingsStartTime = Date.now();

    const followingsData = await getFollowingsList(
      page,
      targetUsername,
      maxFollowing
    );

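    // followingsData is assumed (from how it is used below) to contain at
    // least { fullData: [...], usernames: [...] } -- fullData holds the raw
    // user objects written to disk, and usernames feeds the per-profile
    // scraping loop in Step 5.
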
    const followingsEndTime = Date.now();
    const followingsTime = (
      (followingsEndTime - followingsStartTime) /
      1000
    ).toFixed(2);

    console.log(
      `✓ Captured ${followingsData.fullData.length} followings in ${followingsTime}s\n`
    );

    // Save followings data
    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    const followingsFile = `followings_${targetUsername}_${timestamp}.json`;
    fs.writeFileSync(
      followingsFile,
      JSON.stringify(
        {
          targetUsername,
          scrapedAt: new Date().toISOString(),
          totalFollowings: followingsData.fullData.length,
          followings: followingsData.fullData,
        },
        null,
        2
      )
    );
    console.log(`Followings data saved to: ${followingsFile}\n`);

    // Step 5: Scrape individual profiles
    console.log(
      `📊 Step 5: Scraping ${maxProfilesToScrape} individual profiles...`
    );
    const profilesStartTime = Date.now();
    const profilesData = [];
    const usernamesToScrape = followingsData.usernames.slice(
      0,
      maxProfilesToScrape
    );

    for (let i = 0; i < usernamesToScrape.length; i++) {
      const username = usernamesToScrape[i];
      console.log(
        ` [${i + 1}/${usernamesToScrape.length}] Scraping @${username}...`
      );

      try {
        const profileData = await scrapeProfile(page, username);
        profilesData.push(profileData);
        console.log(` @${username}: ${profileData.followerCount} followers`);

        // Human-like delay between profiles
        await randomSleep(3000, 6000);

        // Take a longer break every 3 profiles
        if ((i + 1) % 3 === 0 && i < usernamesToScrape.length - 1) {
          console.log(" ⏸ Taking a human-like break...");
          await simulateHumanBehavior(page, { mouseMovements: 4, scrolls: 2 });
          await randomSleep(8000, 12000);
        }
      } catch (error) {
        console.log(` Failed to scrape @${username}: ${error.message}`);
      }
    }

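    // Rough pacing budget from the constants above: each profile costs a
    // 3-6 s randomSleep, and every third profile adds an 8-12 s break plus
    // extra simulated activity. For the default MAX_PROFILES=5 that is about
    // 15-30 s of sleeps plus one 8-12 s break (roughly 23-42 s of deliberate
    // delay) on top of page-load and scraping time.
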
    const profilesEndTime = Date.now();
    const profilesTime = ((profilesEndTime - profilesStartTime) / 1000).toFixed(
      2
    );

    console.log(
      `\n✓ Scraped ${profilesData.length} profiles in ${profilesTime}s\n`
    );

    // Step 6: Save profiles data
    console.log("Step 6: Saving profile data...");
    const profilesFile = `profiles_${targetUsername}_${timestamp}.json`;
    fs.writeFileSync(
      profilesFile,
      JSON.stringify(
        {
          targetUsername,
          scrapedAt: new Date().toISOString(),
          totalProfiles: profilesData.length,
          profiles: profilesData,
        },
        null,
        2
      )
    );
    console.log(`Profiles data saved to: ${profilesFile}\n`);

    // Calculate total time (keep the arithmetic on numbers; format only for display)
    const totalEndTime = Date.now();
    const totalElapsedSeconds = (totalEndTime - totalStartTime) / 1000;
    const totalTime = totalElapsedSeconds.toFixed(2);
    const totalMinutes = Math.floor(totalElapsedSeconds / 60);
    const totalSeconds = (totalElapsedSeconds % 60).toFixed(2);

    // Step 7: Summary
    console.log("=".repeat(60));
    console.log("📊 SCRAPING SUMMARY");
    console.log("=".repeat(60));
    console.log(`✓ Logged in successfully`);
    console.log(`✓ Session cookies saved`);
    console.log(
      `✓ ${followingsData.fullData.length} followings captured in ${followingsTime}s`
    );
    console.log(
      `✓ ${profilesData.length} profiles scraped in ${profilesTime}s`
    );
    console.log(`\n📁 Files created:`);
    console.log(` • ${followingsFile}`);
    console.log(` • ${profilesFile}`);
    console.log(` • session_cookies.json`);
    console.log(
      `\n⏱️ Total execution time: ${totalMinutes}m ${totalSeconds}s`
    );
    console.log("=".repeat(60) + "\n");

    return {
      success: true,
      followingsCount: followingsData.fullData.length,
      profilesCount: profilesData.length,
      followingsData: followingsData.fullData,
      profilesData,
      session,
      timings: {
        followingsTime: parseFloat(followingsTime),
        profilesTime: parseFloat(profilesTime),
        totalTime: parseFloat(totalTime),
      },
    };
  } catch (error) {
    console.error("\nScraping workflow failed:");
    console.error(error.message);
    console.error(error.stack);
    throw error;
  } finally {
    if (browser) {
      console.log("Closing browser...");
      await browser.close();
      console.log("Browser closed\n");
    }
  }
}

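// A minimal sketch of driving the full workflow from another script (the file
// name in the require is an assumption -- adjust it to wherever this module
// actually lives):
//
//   const { fullScrapingWorkflow } = require("./examples.js");
//   fullScrapingWorkflow()
//     .then((r) => console.log(r.followingsCount, r.timings.totalTime))
//     .catch(console.error);
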
// Alternative: Use the built-in scrapeWorkflow function
async function simpleWorkflow() {
  console.log("Starting Simple Scraping Workflow (using scrapeWorkflow)...\n");

  const credentials = {
    username: process.env.INSTAGRAM_USERNAME || "your_username",
    password: process.env.INSTAGRAM_PASSWORD || "your_password",
  };

  const targetUsername = process.env.TARGET_USERNAME || "instagram";
  const maxFollowing = parseInt(process.env.MAX_FOLLOWING || "20", 10);
  const proxy = process.env.PROXY || null;

  try {
    console.log(`Target: @${targetUsername}`);
    console.log(`Max following to scrape: ${maxFollowing}`);
    console.log(`Using proxy: ${proxy || "None"}\n`);

    const result = await scrapeWorkflow(
      credentials,
      targetUsername,
      proxy,
      maxFollowing
    );

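    // The result object is assumed (from the fields read below) to expose at
    // least scrapedProfiles (a count) and followingsFullData (an array of
    // user objects); anything else scrapeWorkflow returns is passed through
    // untouched to the caller.
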
console.log("\nScraping completed successfully!");
|
|
console.log(`Total profiles scraped: ${result.scrapedProfiles}`);
|
|
console.log(
|
|
`Full following data captured: ${result.followingsFullData.length} users`
|
|
);
|
|
|
|
// Save the data
|
|
if (result.followingsFullData.length > 0) {
|
|
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
|
const filename = `scraped_data_${targetUsername}_${timestamp}.json`;
|
|
|
|
fs.writeFileSync(
|
|
filename,
|
|
JSON.stringify(
|
|
{
|
|
targetUsername,
|
|
scrapedAt: new Date().toISOString(),
|
|
totalUsers: result.followingsFullData.length,
|
|
data: result.followingsFullData,
|
|
},
|
|
null,
|
|
2
|
|
)
|
|
);
|
|
|
|
console.log(`Data saved to: ${filename}`);
|
|
}
|
|
|
|
return result;
|
|
} catch (error) {
|
|
console.error("\nScraping failed:");
|
|
console.error(error.message);
|
|
throw error;
|
|
}
|
|
}
|
|
|
|
// Scheduled scraping with cron
async function scheduledScraping() {
  console.log("Starting Scheduled Scraping...\n");

  const credentials = {
    username: process.env.INSTAGRAM_USERNAME || "your_username",
    password: process.env.INSTAGRAM_PASSWORD || "your_password",
  };

  const targetUsername = process.env.TARGET_USERNAME || "instagram";
  const intervalMinutes = parseInt(process.env.SCRAPE_INTERVAL || "60", 10);
  const maxRuns = parseInt(process.env.MAX_RUNS || "5", 10);

  console.log(
    `Will scrape @${targetUsername} every ${intervalMinutes} minutes`
  );
  console.log(`Maximum runs: ${maxRuns}\n`);

  let runCount = 0;

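  // cronJobs is assumed to invoke the callback on the given interval and to
  // resolve with a stop handle (hence stopCron, kept only as a reference
  // here); per the inline comment below, the interval argument is expressed
  // in seconds, so minutes are converted before being passed in.
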
  const stopCron = await cronJobs(
    async () => {
      runCount++;
      console.log(`\n${"=".repeat(60)}`);
      console.log(
        `📅 Scheduled Run #${runCount} - ${new Date().toLocaleString()}`
      );
      console.log("=".repeat(60));

      try {
        await simpleWorkflow();
      } catch (error) {
        console.error(`Run #${runCount} failed:`, error.message);
      }

      if (runCount >= maxRuns) {
        console.log(`\nCompleted ${maxRuns} scheduled runs. Stopping...`);
        process.exit(0);
      }
    },
    intervalMinutes * 60, // Convert to seconds
    maxRuns
  );

  console.log("Cron job started. Press Ctrl+C to stop.\n");
}

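// Running this file directly picks a workflow from the MODE environment
// variable. Illustrative invocations (the file name is an assumption --
// substitute the actual script name):
//
//   node examples.js                  # full workflow (default)
//   MODE=simple node examples.js      # single scrapeWorkflow pass
//   MODE=scheduled node examples.js   # repeat on an interval via cronJobs
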
// Main entry point
if (require.main === module) {
  const mode = process.env.MODE || "full"; // full, simple, or scheduled

  console.log(`Mode: ${mode}\n`);

  let workflow;
  if (mode === "simple") {
    workflow = simpleWorkflow();
  } else if (mode === "scheduled") {
    workflow = scheduledScraping();
  } else {
    workflow = fullScrapingWorkflow();
  }

  workflow
    .then(() => {
      // In scheduled mode the promise presumably resolves as soon as the cron
      // job is registered (scheduledScraping awaits cronJobs and returns), so
      // exiting here would kill the pending runs; its callback terminates the
      // process itself after maxRuns.
      if (mode !== "scheduled") {
        console.log("All done!");
        process.exit(0);
      }
    })
    .catch((err) => {
      console.error("\nFatal error:", err);
      process.exit(1);
    });
}

module.exports = {
  fullScrapingWorkflow,
  simpleWorkflow,
  scheduledScraping,
};