% This must be in the first 5 lines to tell arXiv to use pdfLaTeX, which is strongly recommended.
\pdfoutput=1
% In particular, the hyperref package requires pdfLaTeX in order to break URLs across lines.

\documentclass[11pt]{article}

% Remove the "review" option to generate the final version.
\usepackage{acl}

% Standard package includes
\usepackage{times}
\usepackage{latexsym}

% For proper rendering and hyphenation of words containing Latin characters (including in bib files)
\usepackage[T1]{fontenc}
% For Vietnamese characters
% \usepackage[T5]{fontenc}
% See https://www.latex-project.org/help/documentation/encguide.pdf for other character sets

% This assumes your files are encoded as UTF8
\usepackage[utf8]{inputenc}

% This is not strictly necessary, and may be commented out,
% but it will improve the layout of the manuscript,
% and will typically save some space.
\usepackage{microtype}

% This is also not strictly necessary, and may be commented out.
% However, it will improve the aesthetics of text in
% the typewriter font.
\usepackage{inconsolata}
\usepackage{tabularx}
\usepackage{multirow}
\usepackage{enumitem}
\setlist[itemize]{noitemsep}
\setlist[enumerate]{noitemsep}
\usepackage{graphicx}

% If the title and author information does not fit in the area allocated, uncomment the following
%
%\setlength\titlebox{<dim>}
%
% and set <dim> to something 5cm or larger.

\title{Creating Digital Learning and Reference Resources for Southern Michif}

% Author information can be set in various styles:
% For several authors from the same institution:
% \author{Author 1 \and ... \and Author n \\
%         Address line \\ ... \\ Address line}
% if the names do not fit well on one line use
%         Author 1 \\ {\bf Author 2} \\ ... \\ {\bf Author n} \\
% For authors from different institutions:
% \author{Author 1 \\ Address line \\  ... \\ Address line
%         \And  ... \And
%         Author n \\ Address line \\ ... \\ Address line}
% To start a separate ``row'' of authors use \AND, as in
% \author{Author 1 \\ Address line \\  ... \\ Address line
%         \AND
%         Author 2 \\ Address line \\ ... \\ Address line \And
%         Author 3 \\ Address line \\ ... \\ Address line}

\author{Heather Souter, Olivia Sammons \\
  Prairies to Woodlands\\
  Indigenous Revitalization Circle \\
  \texttt{\{hsouter,osammons\}@p2wilr.org} \\
  \And
  David Huggins-Daines \\
  Independent Researcher \\
  \texttt{dhd@ecolingui.ca} 
  }

\begin{document}
\maketitle
\begin{abstract}
 Minority and Indigenous languages are often under-documented and under-resourced. Where such resources do exist, particularly in the form of legacy materials, they are often inaccessible to learners and educators involved in revitalization efforts, whether due to the limitations of their original formats or the structure of their contents.  Digitizing such resources and making them available on a variety of platforms is one step in overcoming these barriers. This is a major undertaking which requires significant expertise at the intersection of documentary linguistics, computational linguistics, and software development, and must be done while walking alongside speakers and language specialists in the community.  We discuss the particular strategies and challenges involved in the development of one such resource, and make recommendations for future projects with a similar goal of mobilizing legacy language resources.
 
\end{abstract}

\section{Introduction}
\label{sec:intro}

Michif, ma-laañg-inaan, katawashishin\footnote{Michif, our language, is beautiful.}
(Heather Souter).  Southern Michif (ISO 639-3: {\tt crg}; hereafter ``Michif'') is one of three language varieties spoken by the M{\'e}tis~\cite{aarhus1997language, sammons2019}. It is a contact language combining elements from Algonquian languages—Plains Cree and the Saulteaux dialect of Ojibwe—with M{\'e}tis French. Michif has traditionally been spoken in small, diasporic communities across western Canada and the northern United States, mainly on the Prairies. Reliable census data regarding the current number of Michif speakers are unavailable, largely due to ambiguity around the use of the label ``Michif''. However, Southern Michif speakers and community members who are actively involved in community-based language revitalization informally estimate that there are likely fewer than 100 speakers today \cite{chew_creating_2023}. Intergenerational transmission of the language has ceased, and all but one or two mother-tongue speakers are over 70 years of age. Despite growing revitalization activities in M{\'e}tis communities in western Canada, few print and digital resources based on best practices in lexicography, language documentation, and second language acquisition are available to support those efforts.

The primary aim of this project was to digitize and make accessible an
out-of-print Michif dictionary~\cite{crawford}, while also developing
local capacity in technologies for Indigenous language documentation
and revitalization. With the assistance of Michif
first-language speakers, community-based language workers, project
partners, and computational linguists, we have developed the
Michif Talking Dictionary,\footnote{\url{https://dictionary.michif.org/}} a digital spoken version of this important
print resource. This dictionary is now available as a progressive web
application, adapted to a wide variety of screen sizes, as shown in
Figure~\ref{fig:screenshot}.  Once loaded, the application does not
require an Internet connection for searching and browsing.  Its source code, along
with the code used to process the text and annotated speech data
for the dictionary, is publicly available under an open-source
license.\footnote{\url{https://github.com/p2wilrc/mtd-michif/}}

\begin{figure}[ht]
  \centering
  \includegraphics[width=0.9\linewidth]{IMG_8043}
  \caption{Mobile dictionary on iPhone SE}
  \label{fig:screenshot}
\end{figure}

Another major goal of this project was to develop capacity through the
training of emerging Métis community linguists, language workers, and
scholars in the areas of audio recording, application of speech
technologies, and annotation.  Between September 2019 and May 2021,
one workshop on recording and five workshops on annotation were held
in Brandon, Manitoba, Ottawa, Ontario, and online via
Zoom.\footnote{Workshops moved online after the outbreak of COVID-19
  and the resulting restrictions on travel and gathering.}

The original book, {\it The Michif Dictionary: Turtle Mountain
  Chippewa Cree}, is recognized for its valuable contribution to Michif
language documentation. However, this now out-of-print resource is
largely inaccessible to learners of Michif: used copies are rarely
available for purchase, and command high prices when they are.  While
other Michif dictionaries that include audio from native speakers have
been published and made available in electronic format
(e.g.,~\citealp{rosen2016, gdi2012}, both of which are based primarily
on Michif as it is spoken in Manitoba and Saskatchewan), this
dictionary is exceptional in its degree of coverage of lexical items
and example sentences. In addition, many important linguistic studies
of Michif (e.g.,~\citealp{aarhus1997language}), as well as the lexical
resources mentioned above, have relied to varying degrees on the
contents of the original Turtle Mountain Dictionary as one of their
primary sources. The Turtle Mountain Dictionary is also an important
historical resource, as many Métis community members in Canada have
kinship ties to Belcourt, ND, where the dictionary was created, and
because it includes the speech of an under-represented dialect of
Michif. For all of these reasons, multiple Elders and community
members identified the creation of an electronic edition of this
dictionary as a priority: it is viewed as a resource that is much too
valuable to remain inaccessible, and should instead be put into the
hands of Michif language learners and educators.

Permission was granted by Turtle Mountain Community College, the
dictionary’s copyright holder, to the project team to create a digital
version of the dictionary for online, offline, and mobile use. This
``new'' version retains all of the original content, while also allowing
for the inclusion of audio recordings of headwords and example
sentences, as well as further enrichment through the eventual addition of alternate orthographies and grammatical information for lexical entries.

\section{Recording}
\label{sec:recording}

For the dictionary, 181 hours of high-quality audio recordings were
collected from four separate speakers. One speaker, Verna DeMontigny,
recorded the entire dictionary from cover to cover, while others
recorded selected portions of it. Thus, all entries have been recorded
by at least one speaker, with some entries being recorded by two or
more speakers.

\begin{table*}[htbp]
  \centering
  \begin{tabular}{|l|l|r|}
    \hline
    Speaker & Michif Variety & Hours Recorded \\
    \hline
Verna DeMontigny &
The Corner, Manitoba &
143h14m34.45 \\
Sandra R. Houle &
Belcourt, North Dakota &
12h40m47.14 \\
Albert Parisien &
Belcourt, North Dakota &
15h31m40.16 \\
Connie Henry &
Boggy Creek, Manitoba &
10h00m00.97 \\
\hline    
\multicolumn{2}{|r|}{TOTAL} &
            181h27m02.72 \\
    \hline
  \end{tabular}
  \caption{Dictionary recording hours by speaker}
  \label{tab:recordings}
\end{table*}

As shown in Table~\ref{tab:recordings}, multiple Michif varieties are
represented in these recordings. It was particularly important for the
Belcourt, ND variety to be represented here, as the original creators
of the dictionary spoke this variety.

All recordings were named according to a consistent file-naming
convention and backed up regularly on multiple hard drives and on Dropbox.
Metadata for each session, such as speaker name, location, and covered
pages of the dictionary, was tracked and shared among team members via a Google
spreadsheet.  As we will discuss below, the management of metadata was
one of several challenges we faced in the production of the dictionary;
for example, the information in this spreadsheet ultimately diverged from that contained
in the annotation files.  In our discussion of these challenges we 
hope to identify pitfalls and propose solutions for other groups
involved in a similar endeavour.  In this case, in the absence of a
content management system for the recordings, this problem could have
been partially mitigated by using the spreadsheet's ``data validation''
feature to restrict cell values, much as controlled vocabularies are
used in ELAN.

\section{Annotation}
\label{sec:annotation}

All audio recordings were annotated using
ELAN~\cite{wittenburg-etal-2006-elan} to produce time-aligned
transcripts. First, each recording was segmented into pause-delimited
utterances automatically using a Deep Neural Network (DNN) voice
activity detection service that was developed within the VESTA-ELAN
project by the Centre de Recherche Informatique de
Montréal~\cite{gupta2022crim}. This auto-segmentation saved a
substantial amount of time in the annotation process.

To support remote annotators with heterogeneous Internet connections and computer hardware, hosting of the annotations was moved from Dropbox to Google Drive. Earlier versions of ELAN required WAV files in order to visualize waveforms, which were critical for annotators to be able to see and correct the automatic segmentation. However, disseminating the ``master'' WAV files was a challenge, given their large size. To address this, we down-sampled the original 48 kHz, 24-bit WAV audio into two formats: (1) high-quality MP3 files (44.1 kHz, 16-bit, 128 kbps), which were used for playback; and (2) low-quality WAV files (8 kHz, 8-bit), which were provided only for waveform visualization in ELAN and were never used for playback.  This approach made it feasible to share the entire audio collection with annotators over a cloud-based service, enabling them both to listen to high-quality versions of the audio and to display the corresponding waveforms in ELAN. The master recordings were maintained separately and later used as the source of the audio included in the dictionary.
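
For illustration, the following Python sketch shows this kind of
two-way down-sampling using the {\tt ffmpeg} command-line tool; the
exact commands and parameters used in our pipeline may have differed.

{\small
\begin{verbatim}
# Sketch only: assumes ffmpeg is installed.
import subprocess
from pathlib import Path

def downsample(src: Path, out: Path):
    """Playback MP3 + display-only WAV."""
    mp3 = out / (src.stem + ".mp3")
    vis = out / (src.stem + "_vis.wav")
    # 44.1 kHz, 128 kbps MP3 for playback
    subprocess.run(
        ["ffmpeg", "-y", "-i", str(src),
         "-ar", "44100", "-b:a", "128k",
         str(mp3)], check=True)
    # 8 kHz, 8-bit WAV for waveforms only
    subprocess.run(
        ["ffmpeg", "-y", "-i", str(src),
         "-ar", "8000", "-c:a", "pcm_u8",
         str(vis)], check=True)
\end{verbatim}
}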

The paper dictionary was scanned and converted to text using the
Tesseract 4 optical character recognition engine.  An ELAN template
was created with tiers for English headwords, Michif definitions, and
example sentences, and the corresponding content from the OCR text of
the dictionary was then integrated into these transcripts by a team of
Indigenous and non-Indigenous language workers who contributed to the
project as paid contract employees, volunteers, and, in one case, as a
student in a for-credit independent study course in applied linguistics.
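
The OCR step itself is conceptually simple; the following sketch,
using the {\tt pytesseract} wrapper around Tesseract, illustrates it
(file names are hypothetical, and our actual invocation may have
differed).

{\small
\begin{verbatim}
# Hypothetical OCR sketch; page image
# names are illustrative only.
import pytesseract
from PIL import Image
from pathlib import Path

pages = sorted(Path("scans").glob("*.png"))
with open("ocr.txt", "w") as out:
    for page in pages:
        text = pytesseract.image_to_string(
            Image.open(page), lang="eng")
        out.write(text + "\n")
\end{verbatim}
}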

In most cases, the speakers recorded multiple instances of each word
and example sentence.  The annotators were therefore instructed to
select the best recording for ``export'' to the talking dictionary.
Due to the slow and careful speaking style used, the
example sentences and definitions were frequently split into multiple
segments, which had to be reassembled in the construction of the
talking dictionary.  Annotators were also instructed to adjust the
boundaries of these segments to ensure that no words were cut off.  In
some cases, it was necessary to splice together different instances in
order to obtain an audio clip without false starts or
mispronunciations.

Because of the dialect variation which exists in Southern Michif, as
well as the fact that the recordings were made nearly 40 years after
the creation of the print dictionary, the speakers often diverge from
the original text, or in some cases, provide a corrected version of a
dictionary entry.  Annotators were thus instructed to flag partial
matches as well as novel forms.  In the initial version of the
talking dictionary, we have attempted to remain faithful to the
original text as much as possible, with the exception of typos and
misspellings.  A revised version is in development which will
present these variant and corrected forms along with relevant
grammatical information.

Manual review and correction of the text of the dictionary was
performed by 14 undergraduate students as part of a Community
Service-Learning project in an Indigenous Languages of Canada course
in winter 2021. Students in this course used Transkribus Lite, a
web-based interface to functions of the Transkribus transcription
platform \cite{kahle2017transkribus}, to identify and address errors in
the computer-readable text of the dictionary that were introduced by the
previously applied OCR methods (e.g., correcting misspelled words,
entering words or lines that were present on the page but missed by the
OCR software, etc.). Errors were found and corrected on a total of 1600
lines of text, or 8.5\% of the dictionary.  However, there remained a large
number of systematic OCR errors, such as ambiguity between {\tt l},
{\tt 1}, and {\tt I}, which were corrected semi-automatically in the
dictionary build.

Since different parts of the project were conducted simultaneously,
technical issues arose from the ordering of this work.  For instance,
the post-correction of the dictionary text took place
{\em after} the start of the annotation process, resulting in a divergence
between the text in the ELAN annotations and the dictionary text.  Likewise, while the
dictionary entries from the original OCR output were separated into
definitions and examples when creating the ELAN files, and sometimes
also corrected by the annotators afterwards, these modifications were
not synchronized or linked in any way to the dictionary text.
Because it was infeasible to correct these discrepancies manually,
it was necessary to develop a complex data extraction process using
heuristic matching algorithms to align dictionary text and annotations.

\section{Dictionary Construction}
\label{sec:conversion}

The electronic dictionary was produced using a customized version of the
MotherTongues~\cite{littell-etal-2017-waldayu} platform. This
well-documented open-source tool provides Web and mobile applications
with a flexible and configurable approximate search feature, shown in
Figure~\ref{fig:approx}, along with a tool to automate the conversion
of dictionaries from a variety of formats including spreadsheets, XML,
and JSON files.  Compared to tools such as FLEx~\cite{Beier2022}, it
supports only a very restricted set of lexicographical data, but no such
data exists in the original dictionary in any case.  This is a common
situation for community-developed resources, and the relatively
lightweight nature of MotherTongues allows for the creation of
dictionaries with a minimum of technical expertise.  That said, the
absence of grammatical information in the Michif Talking Dictionary
limits its usefulness for language learners, and we hope to address
this in a subsequent revision.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.9\linewidth]{approx}
  \caption{Approximate Search}
  \label{fig:approx}
\end{figure}

As detailed in sections \ref{sec:recording} and \ref{sec:annotation},
there were four separate sources of information used to produce the
talking dictionary:

\begin{enumerate}
\item The corrected OCR dictionary text.
\item The original recordings.
\item The metadata spreadsheet identifying the speaker, date and
  location of each recording along with the pages of the dictionary
  covered and any comments on audio quality.
\item The ELAN files containing speech segments and aligned lexical
  entries and examples for each recording.
\end{enumerate}

Unfortunately, the need to rapidly organize a distributed annotation
effort, turnover of key personnel, and other difficulties arising from
the COVID-19 pandemic led to widespread inconsistencies within and
between these data sources. The initial version of the talking dictionary
reflected these inconsistencies; the audio was widely misattributed,
mismatched with the text, and of poor quality as it was mistakenly
taken from the low-quality files used for visualization rather than the original
master recordings.  In the absence of a content management system
adapted to this task, it is imperative that the project manager work
in close collaboration with technical team members to identify and
correct these problems.  It would also be useful to continuously
build and deploy the electronic version of the dictionary, and to track
any integration problems, from the beginning of the annotation process.

The first priority when building the dictionary was thus the
reconstruction of the metadata and retrieval of the original audio
files.  As well, while the post-correction of the OCR output resulted
in a fairly consistently formatted text faithful to the original print
version, the organization of the entries in this text created numerous
problems when converting them to a structured format for
presentation.  Among other things, this required the development of a
language identification system, detailed in
Section~\ref{sec:extraction}.

Finally, after extracting structured text from the dictionary entries,
a subsequent matching was performed against the ELAN annotations to
identify and extract the corresponding audio segments.  Because of the
divergence between original and post-corrected text, as well as the
fact that annotators frequently (but inconsistently) corrected the
text in the annotations, this required a multi-stage heuristic
strategy in order to maximise the audio coverage, detailed in
Section~\ref{sec:matching}.

Because of the extensive recording and annotation efforts detailed in
Section~\ref{sec:recording}, there are often recordings of multiple
speakers for both Michif definitions and example sentences.  This
level of complexity in the dictionary entries was not supported by the
version of MotherTongues current at the time.  We therefore extended both the
dictionary builder and the Web user interface to support it, using a
more flexible JSON-based input format.\footnote{Our modifications
will be included in the next release of MotherTongues but are also
available at \url{https://github.com/p2wilrc/mothertongues/}} 

In order to quantify progress in improving the conversion workflow,
100 random entries were sampled and manually converted to this format,
and the performance of the system was evaluated using precision,
recall, and F1 over definitions and examples.  Along with the audio
coverage, the F1 score was recorded and tracked for each weekly build of the
dictionary during the development process.
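
A simplified sketch of this evaluation is shown below; the entry
representation is hypothetical, and the real evaluation also covers
examples and audio coverage.

{\small
\begin{verbatim}
# Simplified per-build evaluation sketch;
# entries are dicts with "word" and
# "definitions" keys (illustrative only).
def prf(gold, pred):
    tp = len(gold & pred)
    p = tp / len(pred) if pred else 0.0
    r = tp / len(gold) if gold else 0.0
    f = 2 * p * r / (p + r) if tp else 0.0
    return p, r, f

def definition_prf(gold_entries, built):
    gold = {(e["word"], d)
            for e in gold_entries
            for d in e["definitions"]}
    pred = {(e["word"], d)
            for e in built
            for d in e["definitions"]}
    return prf(gold, pred)
\end{verbatim}
}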

\subsection{Entry Extraction}
\label{sec:extraction}

The text of the Turtle Mountain Dictionary consists of 350 pages of
Michif lexical entries and example sentences, organized into 9,181
English headwords followed by one or more Michif definitions and
associated example sentences in English and Michif.  We use
``definitions'' to describe these because they are not necessarily
Michif lexical entries; in many cases, they give a {\em description} of
the English word rather than the actual term used in Michif. For example,
the definition for \textit{zucchini}, shown in Figure~\ref{fig:definitions},
literally means `a type of pumpkin',
while the example sentence simply uses {\it zucchini}.\footnote{
{\it Zucchini} is also commonly used in Qu{\'e}bec French
instead of the standard {\it courgette}.}  Likewise, the definition of
{\it zinnia} literally means `flowers of all sorts of colours'.  No
lexical information such as part
of speech, verb class, order, or gender is provided in the original
text.

\begin{figure}[htbp]
  \centering
  \begin{tabularx}{0.9\linewidth}{|X|}
    \hline
    \begin{flushleft}
      {\tt zucchini—en sort di sitroouy; I like zucchini cooked any way. Niweehkishpwow zucchini pikou ishi ay-ishikeeshishoust.}
    \end{flushleft}
    \\
    \hline
    \begin{flushleft}
      {\tt zinnia—lee flueur tout sort di koulueur.}
    \end{flushleft}
    \\
    \hline
  \end{tabularx}
  \caption{Examples of descriptive definitions}
  \label{fig:definitions}
\end{figure}

\begin{figure}[htbp]
  \centering
  \begin{tabularx}{0.9\linewidth}{|X|}
    \hline
    \begin{flushleft}
      {\tt {\it reflect}—wawshaynikayw, wawshayshkoutayw, nanawkatawayistamihk,
        kanaw katawayhtem; {\it The mirror reflects the light. }
        Wawshaynikayw le meerway.
        Wawshayshkoutayw li meerway. {\it He’ll reflect on his past actions. }
        Kananawkatawayistam tawnshi aykitahkamikishit.
        Kanawkatawayhtem kawpaytootahk.}
    \end{flushleft}
    \\
    \hline
  \end{tabularx}
  \caption{Entry structure {\it(English in italic)}}
  \label{fig:dicttext}
\end{figure}

Though the text of the dictionary entries has a relatively consistent
structure, the English example sentences and their Michif translations
are not attached to the corresponding Michif definitions or
consistently ordered.  In general, they are organized in pairs of
English and Michif texts. However, these pairs may contain varying
numbers of sentences, which in turn may correspond to one or more
examples.  For example, in Figure~\ref{fig:dicttext}, there are four
Michif definitions and two English example sentences, each of which
has two different corresponding Michif examples.  The extraction
process must therefore:

\begin{enumerate}
\item Identify and separate the headword and the individual definitions.
\item Separate English and Michif example texts.
\item Create pairs of English and corresponding Michif examples.
\item Match Michif example texts to the corresponding definition words.
\end{enumerate}

In the majority of cases the dictionary text follows one of two
straightforward patterns: either English and Michif examples
alternate, or a single English example is followed by multiple Michif
example sentences, one for each definition.  In some cases, the
individual examples also consist of multiple sentences.
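
As a rough illustration of the first two steps, the following sketch
splits a well-behaved entry on the em dash and the semicolon; real
entries are messier and require the heuristics described in this
section.

{\small
\begin{verbatim}
# Illustrative split of a well-formed
# entry; real entries need more care.
EMDASH = "\u2014"

def split_entry(raw):
    head, _, rest = raw.partition(EMDASH)
    defs, _, exs = rest.partition(";")
    definitions = [d.strip()
                   for d in defs.split(",")
                   if d.strip()]
    return (head.strip(), definitions,
            exs.strip())

split_entry("zucchini" + EMDASH +
            "en sort di sitroouy; "
            "I like zucchini.")
\end{verbatim}
}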

To split the text into sentences, we used the {\tt PySBD}
library~\cite{sadvilkar-neumann-2020-pysbd}, which required some
post-processing to compensate for inconsistencies in how punctuation
and abbreviations were used in the original dictionary.  The initial version
of the dictionary used the off-the-shelf {\tt langid.py}
library~\cite{lui-baldwin-2012-langid} to identify ``not English''
sentences as presumably Michif.  This performed poorly, in part
because Michif is not present in the {\tt langid.py} model, but
also because the orthography used in the Turtle Mountain Dictionary
was specifically designed to resemble English~\cite{crawford}.

Instead, we created a binary classifier for English versus Michif,
using {\tt fastText}~\cite{bojanowski-etal-2017-enriching} with 5-gram
subword features, making the assumption that the English headwords are
valid English and the Michif definitions are valid Michif.  We
manually created a development set consisting of 1250 Michif and
1239 English example sentences to evaluate the performance of these
models, obtaining 99.4\% accuracy, compared to 84.3\% for the original
{\tt langid.py} based approach.  Because any error is unacceptable in
the final dictionary, we maintain a separate list of ``overrides'' to
correct any errors found in testing.  Likewise, we keep a list of
``uncorrectable'' dictionary entries with manually extracted
definitions and examples where the original text cannot be parsed.
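
The following sketch illustrates the classifier using the
{\tt fastText} Python bindings; file names and hyperparameters other
than the subword $n$-gram length are illustrative.

{\small
\begin{verbatim}
# English/Michif classifier with 5-gram
# subword features; the training file has
# one labelled sentence per line:
#   __label__crg Wawshaynikayw le meerway.
#   __label__eng The mirror reflects ...
import fasttext

model = fasttext.train_supervised(
    input="train.txt", minn=5, maxn=5,
    epoch=25)

labels, probs = model.predict(
    "Wawshayshkoutayw li meerway.")
# -> (('__label__crg',), array([...]))
\end{verbatim}
}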

Once the English and Michif sentences have been identified and pairs
of examples created, they are scored against the Michif definitions
using the minimum Levenshtein distance between the definition and any
subsequence of the example, with whitespace and punctuation removed.
In some rare cases, this leads to incorrect matches due to the fact
that the definitions are fully-inflected forms rather than lemmas and
may not match the forms used in the examples.  It may be useful to
implement and evaluate a lemmatizer to improve the example matching.
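
A simplified sketch of this scoring, restricted to contiguous
substrings and using the {\tt python-Levenshtein} package (any edit
distance implementation would serve), is shown below.

{\small
\begin{verbatim}
# Minimum edit distance between the
# definition and any window of the
# example, ignoring case, whitespace
# and punctuation.
import re
from Levenshtein import distance

def clean(s):
    return re.sub(r"[\W_]+", "", s.lower())

def score(defn, ex):
    d, e = clean(defn), clean(ex)
    n = len(d)
    span = max(1, len(e) - n + 1)
    return min(distance(d, e[i:i + n])
               for i in range(span))
\end{verbatim}
}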

\subsection{Annotation Matching}
\label{sec:matching}

As mentioned in Section~\ref{sec:recording}, the original recordings
contain 181 hours of audio.  Of these, there are 105 hours of speech,
which were annotated to identify the 18 hours of speech corresponding
to the Michif dictionary entries and
examples.  This number is considerably smaller than the total amount
of speech, as all entries and examples were read multiple times, with
the best reading selected for the dictionary.  There are also numerous
discussions between the speaker and the linguist regarding the text.
After extracting the structure of the dictionary entries, we process
the annotation files using {\tt
  pympi-ling}~\cite{pympi-1.70}, collecting all the tiers for an
aligned annotation in a single ``Span'' and matching these spans to
entries in the dictionary.
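
A minimal sketch of this step using {\tt pympi-ling} is shown below;
the file name is hypothetical, and the real grouping logic must
handle partially overlapping annotations.

{\small
\begin{verbatim}
# Group time-aligned annotations from all
# tiers of an ELAN file into spans keyed
# by their (start, end) times.
from collections import defaultdict
import pympi

eaf = pympi.Elan.Eaf("session_001.eaf")
spans = defaultdict(dict)
get = eaf.get_annotation_data_for_tier
for tier in eaf.get_tier_names():
    for ann in get(tier):
        start, end, value = ann[:3]
        spans[(start, end)][tier] = value
\end{verbatim}
}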

To compensate for the variable correction of OCR errors in the ELAN
files, we perform an aggressive normalization of the text before
matching annotations, collapsing various ambiguous characters or
sequences (for example, {\tt w/vv} and {\tt t/f}, as well as the ones
noted previously).  In addition, we neutralize common spelling
variations in the Michif text such as {\tt ou/oo}.  In some cases, the
text is duplicated in the annotations, so we check for and repair this
as well.
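
The following sketch illustrates this kind of normalization; the
substitution table used in the actual build is longer.

{\small
\begin{verbatim}
# Collapse OCR-ambiguous characters and
# common spelling variants before matching.
import re

SUBS = [
    ("vv", "w"),    # w read as vv
    ("1", "l"),     # l / 1 / I ambiguity
    ("I", "l"),
    ("f", "t"),     # t / f confusion
    ("ou", "oo"),   # spelling variation
]

def normalize(text):
    for old, new in SUBS:
        text = text.replace(old, new)
    text = text.lower()
    return re.sub(r"[\W_]+", "", text)
\end{verbatim}
}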

Finally, although we used a controlled vocabulary for the type of
annotations, the difference between definitions and examples is not at
all clear in the original dictionary, so they are often misannotated.
Where the misannotation was unambiguous, we were able to repair it
automatically with a Python script, but in some cases this was not
possible. For this reason, the matching algorithm
collects as many annotations as possible, matching on both the English
and Michif text, then ordering by match and annotation type as well as
normalized Levenshtein distance.

A significant challenge for the audio matching is reassembling the
multiple fragments of an example which were split by voice activity
detection.  Annotators were instructed to select only one instance of
any definition or example for a given speaker, and to use annotation
types for the subsequent fragments, but this was not done consistently.
In the case where an audio clip is to be spliced together from
multiple instances, the original fragments are sometimes out of order
in the recording, and while this is indicated by annotator notes, it
is done in free text rather than with a controlled vocabulary,
requiring heuristics and in many cases manual corrections to the
annotator notes in order to get the correct ordering in the output. We
discuss the detection and correction of these errors in the next
section.

\section{Verification and Re-Annotation}

In testing the talking dictionary, it became obvious that many audio
entries were incomplete or mismatched to the text.  Given the scale of
the recordings and annotations and the limited resources available, we
attempted to use forced alignment to detect these problems, similarly
to how the Festvox system~\cite{anumanchipalli2011festvox} excludes
incorrectly labeled prompts to avoid egregious errors in
unit-selection synthesis.  Of course, no pre-trained acoustic models
exist for Michif.  Using the ``universal'' grapheme-to-phoneme
technique from \citet{pine-etal-2022-gi22pi}, we create an approximate
phonetic transcription of the Michif text, then use the same alignment
technique as~\citet{littell-etal-2022-readalong} with a narrow beam
search, flagging examples that fail to align for review.  To
streamline the workflow, we collect the audio clips on an HTML page,
shown in Figure~\ref{fig:elanhtml}, which we package with the relevant
ELAN annotation files and preference ({\tt .pfsx}) files that cause
ELAN to open directly at the annotation to be reviewed.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.9\linewidth]{elanhtml}
  \caption{HTML page for reannotation}
  \label{fig:elanhtml}
\end{figure}

Since false positives are not problematic (we can simply listen to
them to determine that they are correct), a weak alignment model of
this sort is quite effective, allowing us to detect and correct
several hundred annotations which could not be fixed by the automated
processes described in Section~\ref{sec:matching}, generally in cases
where one segment of an example that was split by VAD was not properly
labeled by the annotators.  An unintended side benefit of this
verification is that it gives us word-level time alignments for the
example sentences.  We therefore extended the MotherTongues system to
include a ``read-along'' style highlighting of each word when
listening to the examples in the talking dictionary, as shown in
Figure~\ref{fig:readalong}.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.9\linewidth]{readalong}
  \caption{Read-along highlighting}
  \label{fig:readalong}
\end{figure}

\section{Conclusions}

Some of the technical difficulties we had to overcome in creating this
resource stem from organizational difficulties exacerbated by the
COVID-19 pandemic.  Others may simply be inherent to a large-scale, widely
distributed and heterogeneous data collection and annotation effort.
For future projects of this scale, it is crucial to ensure that
metadata is continuously validated and to avoid, at all costs,
duplicating it across multiple
unsynchronized data sources. It is equally important to involve a
variety of perspectives in the design of the data collection and
processing workflows, including members of the speech community as
well as documentary and computational linguists, and to allow for iterative
improvements to these processes.

When structured data is created as part of the data
collection and annotation process, this data should be considered
authoritative and maintained as such.  If created from an unstructured
data source (such as the OCR output of the paper dictionary), there
should either be a robust process to pull changes and corrections from
this original data source into the structured data, or the original
unstructured data should be archived and left alone.  This may require
careful consideration of the dependencies between different steps
in the process to avoid duplicate or conflicting efforts.

Some of these issues could be avoided with sufficient and appropriate
tooling.  In particular, while ELAN is a robust and highly useful tool
for annotation, it is difficult to integrate with external sources of
metadata, distributed filesystems, or version control systems.  While
ELAN is highly extensible, with numerous third-party plug-ins and
add-ons, it inherently operates at a single-file level, making it
cumbersome to perform tasks involving individual annotations across a
large number of EAF files. This could potentially be achieved by
adding an API to ELAN which would allow it to be controlled by an
external content management system.

Overall, this project has resulted in a resource that will be of
long-term use in Michif language teaching, revitalization, and
study. The dictionary application is now not only accessible to a wide
range of users, but is also searchable, and the recorded Michif
pronunciations of the headwords and example sentences will be
extremely valuable for learners. Moreover, a total of 16 Métis team
members were trained in language documentation and Indigenous
language technologies, developing local capacity. In particular, the
annotators involved in this project developed technical skills while also
gaining valuable exposure to the Michif language. They will be able to
carry this experience and knowledge with them as they continue their
language journeys and contribute to future language revitalization
initiatives.

\makeatletter\ifacl@finalcopy
\section*{Acknowledgements}
This work would not have been possible without the participation of
Verna DeMontigny, who not only recorded every single page of the
dictionary, but also provided invaluable expertise as a native Michif
speaker, educator, and Elder.  Thank you also to the other Michif
speakers who lent their voices to the dictionary: Connie Henry,
Albert Parisien, Sr., and the late Sandra Houle.  This work would
equally have been impossible without the contribution of Turtle
Mountain Community College, who graciously gave permission to use the
original dictionary text.  Samantha Cornelius and Janelle Zazalak coordinated
the annotation effort, while
Jacob Collard and Fineen Davis implemented the initial processing of the
OCR text and annotations for MotherTongues.  Tiara Opissinow provided essential assistance with
project management and reannotation.  David Delorme-Forsman also assisted
the reannotation effort. Christopher Cox and students in the School of Linguistics and Language Studies at Carleton University contributed to the digitization and OCR correction processes, while Gilles Boulianne of CRIM aided greatly with the VESTA-ELAN annotation process.  We are extremely grateful to the National Research Council for supporting the project, and especially to Roland Kuhn for his support, patience, and enthusiasm. Kihchi-marsii!
\fi\makeatother


% Entries for the entire Anthology, followed by custom entries
\bibliography{anthology,custom}

\end{document}
