Processing all .txt Files in a Directory
This code processes all the .txt files in files_directory and saves the results (tokens_df, entities_df, and characters_dict) for each file.
You can copy and paste the whole notebook code below:
from propp_fr import load_models, load_text_file, generate_tokens_df, load_tokenizer_and_embedding_model, get_embedding_tensor_from_tokens_df, generate_entities_df, add_features_to_entities, perform_coreference, extract_attributes, generate_characters_dict, save_tokens_df, save_entities_df, save_book_file
from pathlib import Path
from tqdm.auto import tqdm
# Directory that is scanned for input .txt files and for existing .book files.
# Replace the placeholder with a real path before running; a path that does not
# exist makes Path.iterdir() raise FileNotFoundError.
files_directory = "<directory_containing_txt_files>"

# List the directory once and derive both stem lists from the same snapshot.
directory_entries = list(Path(files_directory).iterdir())
txt_files = sorted(p.stem for p in directory_entries if p.suffix == ".txt")
book_files = sorted(p.stem for p in directory_entries if p.suffix == ".book")

# A .txt file is treated as already processed when a .book file with the same
# stem exists. The set gives O(1) membership tests instead of rescanning the
# list for every .txt file.
processed_stems = set(book_files)
unprocessed_files = [file for file in txt_files if file not in processed_stems]
print(f"Unprocessed Files: {len(unprocessed_files):,}")
# Load the three models once, before the loop, so they are reused for every file.
(
    spacy_model,
    mentions_detection_model,
    coreference_resolution_model,
) = load_models()

# The tokenizer / embedding model pair is chosen from the base model name
# stored under "base_model_name" in the mention-detection model.
tokenizer, embedding_model = load_tokenizer_and_embedding_model(
    mentions_detection_model["base_model_name"],
)
# Run the whole pipeline on each pending file and save its three outputs.
for file_name in tqdm(unprocessed_files, desc="Processing .txt Files"):
    print(f"Processing: {file_name}...")

    # Load the text and build the token table from it.
    text_content = load_text_file(file_name, files_directory)
    tokens_df = generate_tokens_df(text_content, spacy_model)

    # The embedding tensor is computed once per file and passed to both
    # entity generation and coreference below.
    tokens_embedding_tensor = get_embedding_tensor_from_tokens_df(
        text_content, tokens_df, tokenizer, embedding_model
    )

    # Build the entity table, enrich it, then resolve coreference.
    entities_df = generate_entities_df(
        tokens_df, tokens_embedding_tensor, mentions_detection_model
    )
    entities_df = add_features_to_entities(entities_df, tokens_df)
    entities_df = perform_coreference(
        entities_df,
        tokens_embedding_tensor,
        coreference_resolution_model,
        propagate_coref=True,
        rule_based_postprocess=False,
    )

    # extract_attributes returns an updated tokens_df, which is then used
    # together with entities_df to build the characters dictionary.
    tokens_df = extract_attributes(entities_df, tokens_df)
    characters_dict = generate_characters_dict(tokens_df, entities_df)

    # Save all three results under the same file name and directory.
    # NOTE(review): save_book_file presumably writes the "<file_name>.book"
    # file that marks this text as processed on the next run - confirm.
    save_tokens_df(tokens_df, file_name, files_directory)
    save_entities_df(entities_df, file_name, files_directory)
    save_book_file(characters_dict, file_name, files_directory)