Changed sentenceprocessing.py to use spaCy instead of NLTK for both sentiment analysis and tokenization
This commit is contained in:
parent
3ae6a301a6
commit
25044d16d4
4 changed files with 26 additions and 60 deletions
|
@ -2,9 +2,6 @@ DISCORD_BOT_TOKEN=token
|
||||||
BOT_PREFIX="g."
|
BOT_PREFIX="g."
|
||||||
PING_LINE="The Beretta fires fast and won't make you feel any better!"
|
PING_LINE="The Beretta fires fast and won't make you feel any better!"
|
||||||
BLACKLISTED_USERS=
|
BLACKLISTED_USERS=
|
||||||
cooldown=10800
|
|
||||||
hourlyspeak=1318263176134918246
|
|
||||||
ownerid=542701119948849163
|
|
||||||
USERTRAIN_ENABLED="true"
|
USERTRAIN_ENABLED="true"
|
||||||
showmemenabled="true"
|
showmemenabled="true"
|
||||||
NAME="an instance of goober"
|
NAME="an instance of goober"
|
||||||
|
|
|
@ -39,5 +39,5 @@ arch = platform.machine()
|
||||||
slash_commands_enabled = False
|
slash_commands_enabled = False
|
||||||
launched = False
|
launched = False
|
||||||
latest_version = "0.0.0"
|
latest_version = "0.0.0"
|
||||||
local_version = "1.0.6"
|
local_version = "2.0.0a1 (spaCy)"
|
||||||
os.environ['gooberlocal_version'] = local_version
|
os.environ['gooberlocal_version'] = local_version
|
||||||
|
|
|
@ -2,59 +2,43 @@ import re
|
||||||
from modules.globalvars import *
|
from modules.globalvars import *
|
||||||
from modules.translations import *
|
from modules.translations import *
|
||||||
|
|
||||||
import nltk
|
import spacy
|
||||||
import nltk.data
|
from spacy.tokens import Doc
|
||||||
|
from spacytextblob.spacytextblob import SpacyTextBlob
|
||||||
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
nlp.add_pipe("spacytextblob")
|
||||||
|
Doc.set_extension("polarity", getter=lambda doc: doc._.blob.polarity)
|
||||||
|
|
||||||
# Ensure required NLTK resources are available
|
|
||||||
def check_resources():
|
def check_resources():
|
||||||
# Check for required NLTK resources and download if missing
|
try:
|
||||||
resources = {
|
nlp = spacy.load("en_core_web_sm")
|
||||||
'vader_lexicon': 'sentiment/vader_lexicon',
|
except OSError:
|
||||||
'punkt_tab': 'tokenizers/punkt',
|
print("spaCy model not found. Downloading en_core_web_sm...")
|
||||||
}
|
spacy.cli.download("en_core_web_sm")
|
||||||
for resource, path in resources.items():
|
nlp = spacy.load("en_core_web_sm")
|
||||||
try:
|
if "spacytextblob" not in nlp.pipe_names:
|
||||||
nltk.data.find(path)
|
nlp.add_pipe("spacytextblob")
|
||||||
logger.info(f"{resource} is already installed.")
|
print("spaCy model and spacytextblob are ready.")
|
||||||
except Exception:
|
|
||||||
nltk.download(str(resource))
|
|
||||||
|
|
||||||
check_resources()
|
check_resources()
|
||||||
|
|
||||||
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
|
||||||
from nltk.tokenize import word_tokenize
|
|
||||||
|
|
||||||
# Initialize the sentiment analyzer
|
|
||||||
analyzer = SentimentIntensityAnalyzer()
|
|
||||||
|
|
||||||
def is_positive(sentence):
|
def is_positive(sentence):
|
||||||
"""
|
doc = nlp(sentence)
|
||||||
Determines if the sentiment of the sentence is positive.
|
sentiment_score = doc._.polarity # from spacytextblob
|
||||||
Logs debug information and returns True if the sentiment score is > 0.1.
|
|
||||||
"""
|
|
||||||
scores = analyzer.polarity_scores(sentence)
|
|
||||||
sentiment_score = scores['compound']
|
|
||||||
|
|
||||||
# Log a debug message with the sentiment score
|
|
||||||
debug_message = f"{DEBUG}{get_translation(LOCALE, 'sentence_positivity')} {sentiment_score}{RESET}"
|
debug_message = f"{DEBUG}{get_translation(LOCALE, 'sentence_positivity')} {sentiment_score}{RESET}"
|
||||||
logger.info(debug_message)
|
print(debug_message)
|
||||||
|
|
||||||
return sentiment_score > 0.1
|
return sentiment_score > 0.1
|
||||||
|
|
||||||
async def send_message(ctx, message=None, embed=None, file=None, edit=False, message_reference=None):
|
async def send_message(ctx, message=None, embed=None, file=None, edit=False, message_reference=None):
|
||||||
"""
|
|
||||||
Sends or edits a message in a Discord context.
|
|
||||||
Handles both slash command and regular command contexts.
|
|
||||||
"""
|
|
||||||
if edit and message_reference:
|
if edit and message_reference:
|
||||||
try:
|
try:
|
||||||
# Editing the existing message
|
|
||||||
await message_reference.edit(content=message, embed=embed)
|
await message_reference.edit(content=message, embed=embed)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
await ctx.send(f"{RED}{get_translation(LOCALE, 'edit_fail')} {e}{RESET}")
|
await ctx.send(f"{RED}{get_translation(LOCALE, 'edit_fail')} {e}{RESET}")
|
||||||
else:
|
else:
|
||||||
if hasattr(ctx, "respond"):
|
if hasattr(ctx, "respond"):
|
||||||
# For slash command contexts
|
|
||||||
sent_message = None
|
sent_message = None
|
||||||
if embed:
|
if embed:
|
||||||
sent_message = await ctx.respond(embed=embed, ephemeral=False)
|
sent_message = await ctx.respond(embed=embed, ephemeral=False)
|
||||||
|
@ -63,7 +47,6 @@ async def send_message(ctx, message=None, embed=None, file=None, edit=False, mes
|
||||||
if file:
|
if file:
|
||||||
sent_message = await ctx.respond(file=file, ephemeral=False)
|
sent_message = await ctx.respond(file=file, ephemeral=False)
|
||||||
else:
|
else:
|
||||||
# For regular command contexts
|
|
||||||
sent_message = None
|
sent_message = None
|
||||||
if embed:
|
if embed:
|
||||||
sent_message = await ctx.send(embed=embed)
|
sent_message = await ctx.send(embed=embed)
|
||||||
|
@ -74,34 +57,19 @@ async def send_message(ctx, message=None, embed=None, file=None, edit=False, mes
|
||||||
return sent_message
|
return sent_message
|
||||||
|
|
||||||
def append_mentions_to_18digit_integer(message):
|
def append_mentions_to_18digit_integer(message):
|
||||||
"""
|
|
||||||
Removes 18-digit integers from the message (commonly used for Discord user IDs).
|
|
||||||
"""
|
|
||||||
pattern = r'\b\d{18}\b'
|
pattern = r'\b\d{18}\b'
|
||||||
return re.sub(pattern, lambda match: f"", message)
|
return re.sub(pattern, lambda match: "", message)
|
||||||
|
|
||||||
def preprocess_message(message):
|
def preprocess_message(message):
|
||||||
"""
|
|
||||||
Preprocesses the message by removing 18-digit integers and non-alphanumeric tokens.
|
|
||||||
Returns the cleaned message as a string.
|
|
||||||
"""
|
|
||||||
message = append_mentions_to_18digit_integer(message)
|
message = append_mentions_to_18digit_integer(message)
|
||||||
tokens = word_tokenize(message)
|
doc = nlp(message)
|
||||||
tokens = [token for token in tokens if token.isalnum()]
|
tokens = [token.text for token in doc if token.is_alpha or token.is_digit]
|
||||||
return " ".join(tokens)
|
return " ".join(tokens)
|
||||||
|
|
||||||
def improve_sentence_coherence(sentence):
|
def improve_sentence_coherence(sentence):
|
||||||
"""
|
return re.sub(r'\bi\b', 'I', sentence)
|
||||||
Improves sentence coherence by capitalizing isolated 'i' pronouns.
|
|
||||||
"""
|
|
||||||
sentence = sentence.replace(" i ", " I ")
|
|
||||||
return sentence
|
|
||||||
|
|
||||||
def rephrase_for_coherence(sentence):
|
def rephrase_for_coherence(sentence):
|
||||||
"""
|
|
||||||
Rephrases the sentence for coherence by joining words with spaces.
|
|
||||||
(Currently a placeholder function.)
|
|
||||||
"""
|
|
||||||
words = sentence.split()
|
words = sentence.split()
|
||||||
coherent_sentence = " ".join(words)
|
coherent_sentence = " ".join(words)
|
||||||
return coherent_sentence
|
return coherent_sentence
|
|
@ -1,6 +1,7 @@
|
||||||
discord.py
|
discord.py
|
||||||
markovify
|
markovify
|
||||||
nltk
|
spacy
|
||||||
|
spacytextblob
|
||||||
requests
|
requests
|
||||||
psutil
|
psutil
|
||||||
better_profanity
|
better_profanity
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue