CoNLL-2003 Preprocessing
Tutorial on how to preprocess the CoNLL-2003 dataset for Named-Entity Recognition (NER).
The preprocessed dataset should contain:
- The raw text files
- The minimal annotation information (illustrated below):
  - The start and end of each annotation (character indexes in the raw text)
  - The label of each annotation (the entity type)
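For example, the annotations of a single document reduce to a small table with one row per entity. The values below are purely illustrative; the column names match the ones produced by the code in this tutorial:

cat   byte_onset   byte_offset
ORG   0            2
PER   23           38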
Python Code to Preprocess the CoNLL-2003 NER Dataset
#%%
import os
import urllib.request
import zipfile
import pandas as pd
from collections import Counter
from tqdm.auto import tqdm
from propp_fr import save_text_file, save_entities_df
# URL of the dataset
url = "https://data.deepai.org/conll2003.zip"
# Destination paths
data_dir = "datasets/conll2003"
zip_path = os.path.join(data_dir, "conll2003.zip")
# Create directory if it doesn't exist
os.makedirs(data_dir, exist_ok=True)
# Download the zip file
print("Downloading dataset...")
urllib.request.urlretrieve(url, zip_path)
print("Download complete!")
# Unzip
print("Extracting files...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(data_dir)
print("Extraction complete!")
#%%
def generate_split_tokens_df(split_content):
    # Parse the raw CoNLL-2003 lines of one split into a tokens DataFrame:
    # one row per token, with document, paragraph (sentence) and token indexes.
    doc_ID = -1
    paragraph_ID = 0
    token_ID = 0
    tokens_df = []
    for row in split_content:
        row = row.strip()
        if row == '-DOCSTART- -X- -X- O':
            # New document: reset the token counter; the blank line that
            # follows -DOCSTART- bumps paragraph_ID back to 0.
            doc_ID += 1
            token_ID = 0
            paragraph_ID = -1
        elif row == "":
            # Blank line: sentence (paragraph) boundary.
            paragraph_ID += 1
        else:
            word, POS, syntactic, BIO = row.split(" ")
            tokens_df.append({"token_ID": token_ID,
                              "doc_ID": doc_ID,
                              "paragraph_ID": paragraph_ID,
                              "word": word,
                              "POS": POS,
                              "syntactic": syntactic,
                              "BIO": BIO})
            token_ID += 1
    tokens_df = pd.DataFrame(tokens_df)
    return tokens_df
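
# Quick sanity check (illustrative sample in the CoNLL-2003 column layout,
# not taken verbatim from the downloaded files):
_sample = [
    "-DOCSTART- -X- -X- O",
    "",
    "EU NNP B-NP B-ORG",
    "rejects VBZ B-VP O",
    "German JJ B-NP B-MISC",
    "call NN I-NP O",
    "",
    "Peter NNP B-NP B-PER",
    "Blackburn NNP I-NP I-PER",
]
_sample_tokens_df = generate_split_tokens_df(_sample)
print(_sample_tokens_df[["doc_ID", "paragraph_ID", "token_ID", "word", "BIO"]])
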
def extract_conll2003_entities_df(tokens_df):
    # Decode the BIO tags of one document into entity spans
    # (category, start token index, end token index, length).
    entities = []
    entity = None
    for token_ID, BIO in enumerate(tokens_df["BIO"]):
        if BIO == "O":
            if entity is not None:
                entity["end"] = token_ID - 1
                entity["len"] = entity["end"] - entity["start"] + 1
                entities.append(entity)
                entity = None
            continue
        tag, cat = BIO.split("-", 1)
        if tag == "B":
            if entity is not None:
                entity["end"] = token_ID - 1
                entity["len"] = entity["end"] - entity["start"] + 1
                entities.append(entity)
            entity = {"cat": cat, "start": token_ID}
        elif tag == "I":
            if entity is None or entity["cat"] != cat:
                # illegal I after O or type mismatch → treat as B
                if entity is not None:
                    entity["end"] = token_ID - 1
                    entity["len"] = entity["end"] - entity["start"] + 1
                    entities.append(entity)
                entity = {"cat": cat, "start": token_ID}
            # else: valid continuation → do nothing
    # close the last entity at the end of the document
    if entity is not None:
        entity["end"] = len(tokens_df) - 1
        entity["len"] = entity["end"] - entity["start"] + 1
        entities.append(entity)
    entities_df = pd.DataFrame(entities)
    return entities_df
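
# Continuing the sanity check: BIO tags become token-level spans.
# Expected here: EU -> ORG (token 0), German -> MISC (token 2),
# Peter Blackburn -> PER (tokens 4-5).
_sample_entities_df = extract_conll2003_entities_df(_sample_tokens_df)
print(_sample_entities_df)
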
def align_entities_byte_offsets(tokens_df, entities_df):
    # Rebuild the document text (tokens joined by spaces, sentences joined by
    # newlines) and map each entity's token span to character offsets in that
    # text (stored under the byte_onset / byte_offset column names).
    tokens = []
    reconstructed_text = ""
    token_ID = 0
    for paragraph_ID, paragraph_tokens_df in tokens_df.groupby("paragraph_ID"):
        for token_ID_within_sentence, token in enumerate(paragraph_tokens_df["word"].tolist()):
            byte_onset = len(reconstructed_text)
            reconstructed_text += str(token)
            byte_offset = len(reconstructed_text)
            if token_ID_within_sentence != len(paragraph_tokens_df) - 1:
                reconstructed_text += " "
            tokens.append({
                "paragraph_ID": paragraph_ID,
                "token_ID_within_sentence": token_ID_within_sentence,
                "token_ID": token_ID,
                "byte_onset": byte_onset,
                "byte_offset": byte_offset,
            })
            token_ID += 1
        if paragraph_ID != len(tokens_df["paragraph_ID"].unique()) - 1:
            reconstructed_text += "\n"
    tokens_df = pd.DataFrame(tokens)
    # Map mention token spans to character offsets
    df = pd.merge(entities_df,
                  tokens_df[["token_ID", "byte_onset"]],
                  left_on="start",
                  right_on="token_ID",
                  how="left"
                  ).drop(columns=["token_ID"])
    df = pd.merge(df,
                  tokens_df[["token_ID", "byte_offset"]],
                  left_on="end",
                  right_on="token_ID",
                  how="left"
                  ).drop(columns=["token_ID"])
    entities_df = df.copy()
    return reconstructed_text, entities_df, tokens_df
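
# Continuing the sanity check: rebuild the text and map token spans to
# character offsets. Expected text: 'EU rejects German call\nPeter Blackburn'.
_sample_text, _sample_entities_df, _sample_token_spans_df = align_entities_byte_offsets(
    _sample_tokens_df, _sample_entities_df)
print(repr(_sample_text))
print(_sample_entities_df[["cat", "byte_onset", "byte_offset"]])
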
def realign_tokens_offsets(tokens_df, entities_df):
    # Snap arbitrary character spans to token boundaries: for each mention,
    # take the first token whose end lies after the span start and the last
    # token whose start lies before the span end, then use their offsets.
    start_tokens = []
    end_tokens = []
    new_byte_onsets = []
    new_byte_offsets = []
    for mention_byte_onset, mention_byte_offset in entities_df[["byte_onset", "byte_offset"]].values:
        start_token = tokens_df[tokens_df["byte_offset"] > mention_byte_onset].index.min()
        end_token = tokens_df[tokens_df["byte_onset"] < mention_byte_offset].index.max()
        new_byte_onsets.append(tokens_df.loc[start_token, "byte_onset"])
        new_byte_offsets.append(tokens_df.loc[end_token, "byte_offset"])
        start_tokens.append(start_token)
        end_tokens.append(end_token)
    entities_df["start_token"] = start_tokens
    entities_df["end_token"] = end_tokens
    entities_df["byte_onset"] = new_byte_onsets
    entities_df["byte_offset"] = new_byte_offsets
    return entities_df
def extract_mention_text(text_content, entities_df):
    # Slice the reconstructed text at each mention's character offsets to
    # recover its surface form.
    mention_texts = []
    for mention_byte_onset, mention_byte_offset in entities_df[["byte_onset", "byte_offset"]].values:
        mention_texts.append(text_content[mention_byte_onset:mention_byte_offset])
    entities_df["text"] = mention_texts
    entities_df["text"] = entities_df["text"].astype(str)
    return entities_df
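
# Final sanity check: slicing the reconstructed text at the stored offsets
# should give back the mention surface forms
# (expected: ['EU', 'German', 'Peter Blackburn']).
print(extract_mention_text(_sample_text, _sample_entities_df)["text"].tolist())
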
#%%
import json
files_directory = "datasets/conll2003/conll2003-NER_propp_minimal_implementation"
# Make sure the output directory exists before writing the per-document files
os.makedirs(files_directory, exist_ok=True)
split_config = {}
for split_name in ["train", "valid", "test"]:
    split_config[split_name] = []
    print(f"Split: {split_name}")
    with open(os.path.join(data_dir, f"{split_name}.txt"), "r") as f:
        split_content = f.readlines()
    tokens_df = generate_split_tokens_df(split_content)
    for doc_id, doc_tokens_df in tqdm(tokens_df.groupby("doc_ID")):
        doc_entities_df = extract_conll2003_entities_df(doc_tokens_df)
        reconstructed_text, doc_entities_df, doc_tokens_df = align_entities_byte_offsets(doc_tokens_df, doc_entities_df)
        doc_entities_df = doc_entities_df[["cat", "byte_onset", "byte_offset"]]
        file_name = f"{split_name}_{doc_id}"
        save_text_file(reconstructed_text, file_name, files_directory)
        save_entities_df(doc_entities_df, file_name, files_directory)
        split_config[split_name].append(file_name)
with open(os.path.join(files_directory, "split_config.json"), "w") as f:
    json.dump(split_config, f)
import shutil
# Create a ZIP archive
shutil.make_archive(files_directory, 'zip', root_dir=files_directory)
print(f"Archive created: {files_directory}.zip")
#%%
This archive is ready to use with the Propp pipeline.
Alternatively, the dataset archive can be downloaded directly from the datasets section.