Tag Archives: check

LLM word checker

The recent Science Advances paper by Kobak et al. studied

vocabulary changes in more than 15 million biomedical abstracts from 2010 to 2024 indexed by PubMed and showed how the appearance of LLMs led to an abrupt increase in the frequency of certain style words. This excess word analysis suggests that at least 13.5% of 2024 abstracts were processed with LLMs.

Although they say that the analysis was performed on the corpus level and cannot identify individual texts that may have been processed by an LLM, we can of course check the proportion of LLM words in a text. Unfortunately, their online list contains stop words that I am eliminating here.

# based on https://github.com/berenslab/llm-excess-vocab/tree/main

import csv
import re
import os
from collections import Counter
from striprtf.striprtf import rtf_to_text
from nltk.corpus import stopwords
import nltk
import chardet

# Report how many words of an RTF manuscript appear in the "excess words"
# list, as a share of all non-stop-word tokens in the text.

# Ensure stopwords are available
nltk.download('stopwords')

# Paths
rtfd_folder_path = '/Users/x/Desktop/mss_image.rtfd'  # RTFD is a directory
rtf_file_path = os.path.join(rtfd_folder_path, 'TXT.rtf')  # or 'index.rtf'
csv_file_path = '/Users/x/Desktop/excess_words.csv'

# Read the RTF file as raw bytes; its encoding is detected below
with open(rtf_file_path, 'rb') as f:
    raw_data = f.read()

# Try decoding automatically. chardet may report no encoding at all (None),
# which would make bytes.decode() raise a TypeError, so fall back to UTF-8;
# undecodable bytes are replaced rather than aborting the whole run.
encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
rtf_content = raw_data.decode(encoding, errors='replace')
plain_text = rtf_to_text(rtf_content)

# Normalize and tokenize text (lower-cased runs of word characters)
words_in_text = re.findall(r'\b\w+\b', plain_text.lower())

# Remove stopwords, so stop words contained in the excess list never count
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words_in_text if word not in stop_words]

# Load excess words from CSV: the first column of every non-empty row.
# newline='' lets the csv module handle line endings itself.
# NOTE(review): a header row, if the CSV has one, is loaded as a word too -
# confirm against the actual file.
with open(csv_file_path, 'r', encoding='utf-8', newline='') as csv_file:
    reader = csv.reader(csv_file)
    excess_words = {row[0].strip().lower() for row in reader if row}

# Count excess words in filtered text
excess_word_counts = Counter(word for word in filtered_words if word in excess_words)

# Calculate proportion (0 when the text has no countable words)
total_words = len(filtered_words)
total_excess = sum(excess_word_counts.values())
proportion = total_excess / total_words if total_words > 0 else 0

# Output
print("\nExcess Words Found (Sorted by Frequency):")
for word, count in excess_word_counts.most_common():
    print(f"{word}: {count}")

print(f"\nTotal words (without stopwords): {total_words}")
print(f"Total excess words: {total_excess}")
print(f"Proportion of excess words: {proportion:.4f}")
CC-BY-NC