
LitBank Preprocessing

Tutorial on how to preprocess the LitBank Dataset for coreference resolution.

This dataset should contain:

  • The raw text files
  • The annotations' minimal information (a sketch of the target format follows this list):
    • The start and end of the annotation (character indexes in the raw text)
    • The label of the annotation (the entity type)
    • The coreference chain ID
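
Concretely, for each text we will end up with the raw string plus a small table of mentions. A minimal sketch of the target annotation table (the values shown are taken from the worked example later in this tutorial) could look like this:

Python Code
import pandas as pd

# Hypothetical minimal annotation table: character offsets into the raw text,
# an entity label ("cat") and a coreference chain ID ("COREF_name") per mention.
example_entities = pd.DataFrame([
    {"byte_onset": 13, "byte_offset": 21, "cat": "FAC", "COREF_name": "Chancery-0"},
    {"byte_onset": 69, "byte_offset": 84, "cat": "PER", "COREF_name": "Lord_Chancellor-2"},
])
print(example_entities)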

We first need to download the dataset from the GitHub repository: https://github.com/dbamman/litbank/tree/master/coref/brat

Python Code to Download the Dataset
import os

local_dataset_path = "datasets/litbank"
os.makedirs(local_dataset_path, exist_ok=True)

# The specific GitHub folder we want
github_folder_path = "coref/brat"
repo_url = "https://github.com/dbamman/litbank.git"

# Run the git commands (note the exclamation mark: these are notebook shell commands)
!cd {local_dataset_path} && git init
!cd {local_dataset_path} && git remote add origin -f {repo_url}
!cd {local_dataset_path} && git config core.sparseCheckout true
!cd {local_dataset_path} && echo "{github_folder_path}/*" >> .git/info/sparse-checkout
!cd {local_dataset_path} && git pull origin master
# Reorganize: move the brat directory to the root of the dataset path, then remove coref and .git
!cd {local_dataset_path} && mv coref/brat . && rm -rf coref && rm -rf .git
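
The commands above rely on notebook shell magics. Outside a notebook, the same sparse checkout can be sketched in plain Python with subprocess and shutil (same repository and paths as above):

Python Code
import os
import shutil
import subprocess

local_dataset_path = "datasets/litbank"
os.makedirs(local_dataset_path, exist_ok=True)

github_folder_path = "coref/brat"
repo_url = "https://github.com/dbamman/litbank.git"

def git(*args):
    # Run a git command inside the dataset directory, failing loudly on errors
    subprocess.run(["git", *args], cwd=local_dataset_path, check=True)

git("init")
git("remote", "add", "origin", "-f", repo_url)
git("config", "core.sparseCheckout", "true")
with open(os.path.join(local_dataset_path, ".git", "info", "sparse-checkout"), "a") as f:
    f.write(f"{github_folder_path}/*\n")
git("pull", "origin", "master")

# Reorganize: move brat/ to the dataset root, then drop coref/ and the .git metadata
shutil.move(os.path.join(local_dataset_path, "coref", "brat"),
            os.path.join(local_dataset_path, "brat"))
shutil.rmtree(os.path.join(local_dataset_path, "coref"))
shutil.rmtree(os.path.join(local_dataset_path, ".git"))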

We can now take a look at the dataset:

Python Code
from pathlib import Path
from collections import Counter

raw_files_path = "datasets/litbank/brat"

all_files = sorted([p for p in Path(raw_files_path).iterdir() if p.is_file()])

extensions = dict(Counter(p.suffix for p in all_files).most_common())
print(f"The dataset contains {len(all_files):,} files with the following extensions:")
for extension, count in extensions.items():
    print(f"\t'{extension}' = {count:,}")
The dataset contains 200 files with the following extensions:
    '.txt' = 100
    '.ann' = 100
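
Before going further, we can verify that every text file has a matching annotation file with the same stem (a quick consistency check, assuming the one-.txt / one-.ann pairing of the brat format):

Python Code
txt_stems = {p.stem for p in all_files if p.suffix == ".txt"}
ann_stems = {p.stem for p in all_files if p.suffix == ".ann"}

# Every stem should appear with both extensions
print(f"Stems with both .txt and .ann: {len(txt_stems & ann_stems):,}")
print(f"Unpaired stems: {sorted(txt_stems ^ ann_stems)}")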

The .txt files contain the raw text; the .ann files hold the annotations, so we need to take a closer look at one of them.

Python Code
import pandas as pd

file_name = all_files[0].stem

txt_files = [p for p in all_files if p.suffix == ".txt"]
with open(txt_files[0], "r", encoding="utf-8") as f:
    text_content = f.read()

ann_files = [p for p in all_files if p.suffix == ".ann"]
ann_df = pd.read_csv(ann_files[0], sep="\t", header=None)

print(ann_df.sample(5).to_markdown())
|     | 0       | 1    | 2             | 3   | 4   | 5   | 6        | 7   | 8    |
|----:|:--------|:-----|:--------------|:----|:----|:----|:---------|:----|:-----|
| 393 | COREF   | T174 | Ralph_Percy-0 | nan | nan | nan | nan      | nan | nan  |
|  18 | MENTION | T27  | 15            | 45  | 15  | 46  | my house | FAC | NOM  |
|  42 | MENTION | T65  | 70            | 4   | 70  | 4   | wife     | PER | NOM  |
| 335 | COREF   | T135 | settlers-7    | nan | nan | nan | nan      | nan | nan  |
| 245 | MENTION | T254 | 51            | 0   | 51  | 0   | I        | PER | PRON |

We see that the first column contains the annotation type, on which the content of the other columns depends.

Python Code
type_to_description_mapping = {
    "MENTION": "Entity mention with span and semantic type",
    "COREF": "True coreference link between two mentions",
    "COP": "Copular relation between two mentions (not treated as coreference)",
    "APPOS": "Appositional relation between two mentions (not treated as coreference)",
}

row_types_df = pd.DataFrame(
        Counter(ann_df[0]).most_common(),
        columns=["Column '0' Type", "Count"]
    )
row_types_df["Row Type"] = row_types_df["Column '0' Type"].map(type_to_description_mapping)

print(row_types_df.to_markdown(index=False))
| Column '0' Type | Count | Row Type                                                                 |
|:----------------|------:|:-------------------------------------------------------------------------|
| MENTION         |   316 | Entity mention with span and semantic type                               |
| COREF           |   307 | True coreference link between two mentions                               |
| COP             |     6 | Copular relation between two mentions (not treated as coreference)       |
| APPOS           |     3 | Appositional relation between two mentions (not treated as coreference)  |

In this case, only the MENTION and COREF annotations are relevant, as copulae and appositions are not considered strict coreference relations.

We merge MENTION and COREF annotations into a single table and assign unique coreference IDs to singleton mentions:

Python Code
# Filter mentions
mention_df = ann_df[ann_df[0]=="MENTION"].reset_index(drop=True)
mention_df = mention_df[[1,2,3,5,6,7,8]]
mention_df.columns = ["mention_ID","paragraph_ID","start_token_within_sentence","end_token_within_sentence","text","cat","prop"]
mention_df[["paragraph_ID","start_token_within_sentence","end_token_within_sentence"]] = mention_df[["paragraph_ID","start_token_within_sentence","end_token_within_sentence"]].astype(int)

# Filter coreference links
coref_df = ann_df[ann_df[0]=="COREF"].reset_index(drop=True)
coref_df = coref_df[[1,2]]
coref_df.columns = ["mention_ID","COREF_name"]

# Merge mentions with coreference info
entities_df = pd.merge(coref_df, mention_df, on="mention_ID", how="outer")\
                 .drop(columns=["mention_ID"])\
                 .sort_values(["paragraph_ID","start_token_within_sentence","end_token_within_sentence"])\
                 .reset_index(drop=True)

# Fill missing COREF_name with unique names
existing = set(entities_df["COREF_name"].dropna())
for idx, row in entities_df[entities_df["COREF_name"].isna()].iterrows():
    base = row["text"].replace(" ", "_")
    i = 1
    name = base
    while name in existing:
        i += 1
        name = f"{base}_{i}"
    entities_df.at[idx, "COREF_name"] = name
    existing.add(name)

We now split the original text into paragraphs and tokens, compute each token's character offsets (byte_onset, byte_offset), and merge these with the mentions to map token spans onto text offsets.

Python Code
# Split text into tokens and track byte offsets
tokens = []
reconstructed_text = ""
for paragraph_ID, paragraph in enumerate(text_content.split("\n")):
    if paragraph_ID != 0:
        reconstructed_text += "\n"
    for token_ID_within_sentence, token in enumerate(paragraph.split(" ")):
        byte_onset = len(reconstructed_text)
        reconstructed_text += token
        byte_offset = len(reconstructed_text)
        if token_ID_within_sentence != len(paragraph.split(" ")) - 1:
            reconstructed_text += " "
        tokens.append({
            "paragraph_ID": paragraph_ID,
            "token_ID_within_sentence":token_ID_within_sentence,
            "byte_onset": byte_onset,
            "byte_offset": byte_offset,
        })

tokens_df = pd.DataFrame(tokens)

# Map mention token spans to byte offsets
df = pd.merge(entities_df,
              tokens_df[["paragraph_ID", "token_ID_within_sentence", "byte_onset"]],
              left_on=["paragraph_ID", "start_token_within_sentence"],
              right_on=["paragraph_ID", "token_ID_within_sentence"],
              ).drop(columns=["token_ID_within_sentence"])
df = pd.merge(df,
              tokens_df[["paragraph_ID", "token_ID_within_sentence","byte_offset"]],
              left_on=["paragraph_ID", "end_token_within_sentence"],
              right_on=["paragraph_ID", "token_ID_within_sentence"],
              ).drop(columns=["token_ID_within_sentence"])
entities_df = df.copy()

We now have two key objects for downstream coreference tasks:

  • reconstructed_text: the original text, stripped of trailing spaces and with a single \n between paragraphs.
  • entities_df: a pandas.DataFrame containing all annotated mentions, with byte offsets aligned to reconstructed_text and the associated coreference information.

The minimal configuration of entities_df needed to train a coreference resolution pipeline is:

  • byte_onset: character index of the mention start in reconstructed_text,
  • byte_offset: character index just past the mention end (exclusive, so reconstructed_text[byte_onset:byte_offset] yields the mention),
  • cat: the mention type (e.g. PER, FAC, GPE, ORG),
  • COREF_name: a unique identifier for the mention's coreference chain.

In this case entities_df contains additional columns:

  • text: the surface form of the mention,
  • prop: the manually annotated grammatical type of the mention (PROP = proper noun, NOM = common noun phrase, PRON = pronoun).
| byte_onset | byte_offset | cat | COREF_name            | text                | prop |
|-----------:|------------:|:----|:----------------------|:--------------------|:-----|
|         13 |          21 | FAC | Chancery-0            | Chancery            | PROP |
|         22 |          28 | GPE | London-1              | London              | PROP |
|         69 |          84 | PER | Lord_Chancellor-2     | Lord Chancellor     | PROP |
|         96 |         115 | FAC | Lincoln__s_Inn_Hall-3 | Lincoln 's Inn Hall | PROP |
|        163 |         174 | FAC | the_streets-4         | the streets         | NOM  |
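
As a sanity check (a small sketch, assuming the half-open [byte_onset, byte_offset) convention produced above), every mention's offsets should slice reconstructed_text back to its annotated surface form:

Python Code
# Count mentions whose offsets do not reproduce the annotated text exactly
mismatches = sum(
    reconstructed_text[row.byte_onset:row.byte_offset] != row.text
    for row in entities_df.itertuples(index=False)
)
print(f"{mismatches} mention(s) do not slice back to their annotated text")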

We can now save our formatted reconstructed_text and entities_df to our dataset directory:

Python Code
import os
from propp_fr import save_text_file, save_entities_df

local_dataset_path = "datasets/litbank"
os.makedirs(local_dataset_path, exist_ok=True)

save_text_file(reconstructed_text, file_name, local_dataset_path)
save_entities_df(entities_df, file_name, local_dataset_path)

Finally, we can wrap the entire preprocessing pipeline into a loop to handle all 100 LitBank files.

Python Code
from tqdm.auto import tqdm
import pandas as pd
from pathlib import Path
import os
from propp_fr import save_text_file, save_entities_df

raw_files_directory_path = "datasets/litbank/brat"
local_dataset_path = "datasets/litbank/propp_minimal"

file_names = sorted(list(set([p.stem for p in Path(raw_files_directory_path).iterdir() if p.is_file()])))

for file_name in tqdm(file_names):
    # Loading Input Files
    with open(os.path.join(raw_files_directory_path, file_name + ".txt"), "r", encoding="utf-8") as f:
        text_content = f.read()
    ann_df = pd.read_csv(os.path.join(raw_files_directory_path, file_name + ".ann"), sep="\t", header=None)

    # Filter mentions
    mention_df = ann_df[ann_df[0]=="MENTION"].reset_index(drop=True)
    mention_df = mention_df[[1,2,3,5,6,7,8]]
    mention_df.columns = ["mention_ID","paragraph_ID","start_token_within_sentence","end_token_within_sentence","text","cat","prop"]
    mention_df[["paragraph_ID","start_token_within_sentence","end_token_within_sentence"]] = mention_df[["paragraph_ID","start_token_within_sentence","end_token_within_sentence"]].astype(int)

    # Filter coreference links
    coref_df = ann_df[ann_df[0]=="COREF"].reset_index(drop=True)
    coref_df = coref_df[[1,2]]
    coref_df.columns = ["mention_ID","COREF_name"]

    # Merge mentions with coreference info
    entities_df = pd.merge(coref_df, mention_df, on="mention_ID", how="outer")\
                     .drop(columns=["mention_ID"])\
                     .sort_values(["paragraph_ID","start_token_within_sentence","end_token_within_sentence"])\
                     .reset_index(drop=True)

    # Fill missing COREF_name with unique names
    existing = set(entities_df["COREF_name"].dropna())
    for idx, row in entities_df[entities_df["COREF_name"].isna()].iterrows():
        base = row["text"].replace(" ", "_")
        i = 1
        name = base
        while name in existing:
            i += 1
            name = f"{base}_{i}"
        entities_df.at[idx, "COREF_name"] = name
        existing.add(name)

    # Split text into tokens and track byte offsets
    tokens = []
    reconstructed_text = ""
    for paragraph_ID, paragraph in enumerate(text_content.split("\n")):
        if paragraph_ID != 0:
            reconstructed_text += "\n"
        for token_ID_within_sentence, token in enumerate(paragraph.split(" ")):
            byte_onset = len(reconstructed_text)
            reconstructed_text += token
            byte_offset = len(reconstructed_text)
            if token_ID_within_sentence != len(paragraph.split(" ")) - 1:
                reconstructed_text += " "
            tokens.append({
                "paragraph_ID": paragraph_ID,
                "token_ID_within_sentence":token_ID_within_sentence,
                "byte_onset": byte_onset,
                "byte_offset": byte_offset,
            })

    tokens_df = pd.DataFrame(tokens)

    # Map mention token spans to byte offsets
    df = pd.merge(entities_df,
                  tokens_df[["paragraph_ID", "token_ID_within_sentence", "byte_onset"]],
                  left_on=["paragraph_ID", "start_token_within_sentence"],
                  right_on=["paragraph_ID", "token_ID_within_sentence"],
                  ).drop(columns=["token_ID_within_sentence"])
    df = pd.merge(df,
                  tokens_df[["paragraph_ID", "token_ID_within_sentence","byte_offset"]],
                  left_on=["paragraph_ID", "end_token_within_sentence"],
                  right_on=["paragraph_ID", "token_ID_within_sentence"],
                  ).drop(columns=["token_ID_within_sentence", "paragraph_ID", "start_token_within_sentence", "end_token_within_sentence"])
    entities_df = df.copy()

    minimal_columns = ['byte_onset', 'byte_offset', 'cat', 'COREF_name']
    entities_df = entities_df[minimal_columns + [col for col in df.columns if col not in minimal_columns]]

    file_name = file_name.replace("_brat", "") if file_name.endswith("_brat") else file_name
    save_text_file(reconstructed_text, file_name, local_dataset_path)
    save_entities_df(entities_df, file_name, local_dataset_path)

We also retrieve the official ten-fold cross-validation splits used for LitBank coreference evaluation from the lrec2020-coref repository, storing the document IDs of each test fold in a split_config.json file:

Python Code
import requests

split_id = 0
split_config = {}

while True:
    url = f"https://raw.githubusercontent.com/dbamman/lrec2020-coref/master/data/litbank_tenfold_splits/{split_id}/test.ids"
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # No more splits (404 or network error)
        break

    content = response.text

    split_content = [
        file_name.replace("_brat.tsv", "")
        for file_name in content.split("\n")
        if file_name.endswith("_brat.tsv")
    ]

    split_config[f"test_{split_id}"] = split_content
    split_id += 1

import json

with open(os.path.join(local_dataset_path, "split_config.json"), "w") as f:
    json.dump(split_config, f)
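
To confirm the split file was written correctly, we can read it back and inspect the fold sizes (the ten test folds should together cover the 100 LitBank documents):

Python Code
import json
import os

# Reload the split configuration and print the number of document IDs per test fold
with open(os.path.join(local_dataset_path, "split_config.json"), "r") as f:
    reloaded_splits = json.load(f)

for split_name, document_ids in reloaded_splits.items():
    print(f"{split_name}: {len(document_ids)} documents")
print(f"Total: {sum(len(ids) for ids in reloaded_splits.values())} documents")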

The last step packages the processed dataset into a ZIP archive next to the dataset directory:

Python Code
import shutil

output_archive_name = "litbank_propp_minimal_implementation"
# Path where the ZIP will be saved (same level as local_dataset_path)
output_path = os.path.join(os.path.dirname(local_dataset_path), output_archive_name)
# Create a ZIP archive
shutil.make_archive(output_path, 'zip', root_dir=local_dataset_path)
print(f"Archive created: {output_path}.zip")

This produces the compressed archive of the processed dataset:

litbank_propp_minimal_implementation.zip

This archive is ready to use with the Propp pipeline.

Alternatively, the dataset archive can be downloaded directly from the datasets section.