From 708c9e39bbf9cbd5c40eaaa746a09e8ede5b988c Mon Sep 17 00:00:00 2001
From: WhatDidYouExpect <89535984+WhatDidYouExpect@users.noreply.github.com>
Date: Tue, 21 Jan 2025 16:46:52 +0100
Subject: [PATCH] Create webscraper.py

---
 customcommands/webscraper.py | 131 +++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 customcommands/webscraper.py

diff --git a/customcommands/webscraper.py b/customcommands/webscraper.py
new file mode 100644
index 0000000..ebcf144
--- /dev/null
+++ b/customcommands/webscraper.py
@@ -0,0 +1,131 @@
+import json
+from urllib.parse import urljoin
+
+import aiohttp
+from bs4 import BeautifulSoup
+from discord.ext import commands
+
+from config import ownerid
+
+
+class WebScraper(commands.Cog):
+    def __init__(self, bot):
+        self.bot = bot
+        self.visited_urls = set()
+        self.last_scrape_start = None  # length of memory.json before the last scrape
+
+    async def fetch(self, session, url):
+        """Fetch the HTML content of a URL."""
+        try:
+            async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
+                return await response.text()
+        except Exception as e:
+            print(f"Failed to fetch {url}: {e}")
+            return None
+
+    def extract_sentences(self, text):
+        """Naively split text into sentences on full stops."""
+        sentences = text.split('.')
+        return [sentence.strip() for sentence in sentences if sentence.strip()]
+
+    def load_memory(self):
+        """Load memory.json, returning an empty list if it is missing or invalid."""
+        try:
+            with open("memory.json", "r") as file:
+                return json.load(file)
+        except (FileNotFoundError, json.JSONDecodeError):
+            return []
+
+    def save_to_json(self, sentences):
+        """Append sentences to memory.json."""
+        try:
+            data = self.load_memory()
+            data.extend(sentences)
+            with open("memory.json", "w") as file:
+                json.dump(data, file, indent=4)
+        except Exception as e:
+            print(f"Failed to save to JSON: {e}")
+
+    def undo_last_scrape(self):
+        """Undo the last scrape by removing every sentence it added."""
+        try:
+            data = self.load_memory()
+
+            if not data or self.last_scrape_start is None:
+                print("No data to undo.")
+                return False
+
+            # Truncate back to the length recorded before the last scrape.
+            data = data[:self.last_scrape_start]
+            self.last_scrape_start = None
+
+            with open("memory.json", "w") as file:
+                json.dump(data, file, indent=4)
+
+            return True
+        except Exception as e:
+            print(f"Failed to undo last scrape: {e}")
+            return False
+
+    async def scrape_links(self, session, url, depth=2):
+        """Recursively scrape paragraphs and follow links from a URL."""
+        if depth == 0 or url in self.visited_urls:
+            return
+
+        print(f"Scraping: {url}")
+        self.visited_urls.add(url)
+
+        html = await self.fetch(session, url)
+        if not html:
+            return
+
+        soup = BeautifulSoup(html, "html.parser")
+
+        for paragraph in soup.find_all('p'):
+            sentences = self.extract_sentences(paragraph.get_text())
+            self.save_to_json(sentences)
+
+        for link in soup.find_all('a', href=True):
+            full_url = urljoin(url, link['href'])
+            if full_url.startswith("http") and full_url not in self.visited_urls:
+                await self.scrape_links(session, full_url, depth - 1)
+
+    @commands.command()
+    async def start_scrape(self, ctx, start_url: str):
+        """Command to start the scraping process."""
+        if ctx.author.id != ownerid:
+            await ctx.send("You do not have permission to use this command.")
+            return
+
+        if not start_url.startswith("http"):
+            await ctx.send("Please provide a valid URL.")
+            return
+
+        await ctx.send(f"Starting scrape from {start_url}... This may take a while!")
+
+        # Record where memory.json ends so undo_scrape can roll back to it,
+        # and forget URLs from earlier runs so repeat scrapes are not skipped.
+        self.last_scrape_start = len(self.load_memory())
+        self.visited_urls.clear()
+
+        async with aiohttp.ClientSession() as session:
+            await self.scrape_links(session, start_url)
+
+        await ctx.send("Scraping complete! Sentences saved to memory.json.")
+
+    @commands.command()
+    async def undo_scrape(self, ctx):
+        """Command to undo the last scrape."""
+        if ctx.author.id != ownerid:
+            await ctx.send("You do not have permission to use this command.")
+            return
+
+        success = self.undo_last_scrape()
+        if success:
+            await ctx.send("Last scrape undone successfully.")
+        else:
+            await ctx.send("No data to undo or an error occurred.")
+
+
+async def setup(bot):
+    await bot.add_cog(WebScraper(bot))
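
For context, a minimal sketch of how this cog might be wired up, assuming discord.py 2.x and that config.py also defines a bot `token` (this patch only confirms that config.py defines `ownerid`):

    import asyncio
    import discord
    from discord.ext import commands

    from config import token  # assumed; not part of this patch

    intents = discord.Intents.default()
    intents.message_content = True  # prefix commands need to read message content

    bot = commands.Bot(command_prefix="!", intents=intents)

    async def main():
        async with bot:
            # load_extension imports webscraper.py and awaits its setup(),
            # which registers the WebScraper cog and its commands
            await bot.load_extension("customcommands.webscraper")
            await bot.start(token)

    asyncio.run(main())

With that in place, the owner can start a crawl with `!start_scrape https://example.com` and roll back its additions to memory.json with `!undo_scrape`.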