% This must be in the first 5 lines to tell arXiv to use pdfLaTeX, which is strongly recommended.
%\pdfoutput=1
% In particular, the hyperref package requires pdfLaTeX in order to break URLs across lines.

\documentclass[11pt]{article}

% Remove the "review" option to generate the final version.
\usepackage{acl}

%\usepackage[draft]{todonotes}
\usepackage{booktabs}
% Standard package includes
\usepackage{times}
\usepackage{latexsym}

\usepackage{subcaption}
\usepackage{caption}
\usepackage{graphicx}
\usepackage[draft]{todonotes}

\newcommand{\review}[1]{\todo[inline,color=yellow]{Review: #1}}

% For proper rendering and hyphenation of words containing Latin characters (including in bib files)
\usepackage[T1]{fontenc}
% For Vietnamese characters
% \usepackage[T5]{fontenc}
% See https://www.latex-project.org/help/documentation/encguide.pdf for other character sets

% This assumes your files are encoded as UTF8
\usepackage[utf8]{inputenc}

% chinese chars
\usepackage{CJKutf8}

% examples
\usepackage{expex} % for example sentences
\newenvironment{example}{\ex}{\xe\ignorespacesafterend} % more latexy
\newenvironment{pexample}{\pex}{\xe\ignorespacesafterend} % more latexy

% This is not strictly necessary, and may be commented out,
% but it will improve the layout of the manuscript,
% and will typically save some space.
\usepackage{microtype}

% This is also not strictly necessary, and may be commented out.
% However, it will improve the aesthetics of text in
% the typewriter font.
\usepackage{inconsolata}

% If the title and author information does not fit in the area allocated, uncomment the following
%
%\setlength\titlebox{<dim>}
%
% and set <dim> to something 5cm or larger.

\title{Investigating Discourse Segmentation in Taiwan Southern Min Spontaneous Speech}

% Author information can be set in various styles:
% For several authors from the same institution:
% \author{Author 1 \and ... \and Author n \\
%         Address line \\ ... \\ Address line}
% if the names do not fit well on one line use
%         Author 1 \\ {\bf Author 2} \\ ... \\ {\bf Author n} \\
% For authors from different institutions:
% \author{Author 1 \\ Address line \\  ... \\ Address line
%         \And  ... \And
%         Author n \\ Address line \\ ... \\ Address line}
% To start a separate ``row'' of authors use \AND, as in
% \author{Author 1 \\ Address line \\  ... \\ Address line
%         \AND
%         Author 2 \\ Address line \\ ... \\ Address line \And
%         Author 3 \\ Address line \\ ... \\ Address line}

\author{Laurent Prévot$^{1,2}$ \\
  $^{1}$CNRS \& MEAE, CEFC \\
  $^{2}$CNRS \& Aix Marseille Université, LPL \\
  Taipei, Taiwan\\
  \texttt{laurent.prevot@cefc.com.hk} \\\And
  Sheng-Fu Wang \\
  Institute of Linguistics \\
  Academia Sinica \\
  Taipei, Taiwan \\
  \texttt{sftwang@gate.sinica.edu.tw} \\}

\begin{document}
\maketitle
\begin{abstract}
 In recent years, discourse segmentation has received increased attention; however, the majority of studies have focused on written genres and languages with abundant linguistic resources. This paper investigates discourse segmentation of a spontaneous speech corpus in Taiwan Southern Min. We compare fine-tuning a Large Language Model (LLM) using two approaches: supervised, taking advantage of a high-quality annotated dataset, and weakly-supervised, which requires only a small amount of manual labeling. The corpus used here is transcribed in both Chinese characters and romanized script. This allows us to assess the impact of the written form on the discourse segmentation task. Moreover, the dataset includes manual prosodic break labeling, allowing an exploration of the role prosody can play in contemporary discourse segmentation systems grounded in LLMs. In our study, the supervised approach outperforms weak supervision; the character-based version demonstrates better scores compared to the romanized version; and prosodic information proves to be an interesting source for increasing discourse segmentation performance.
\end{abstract}


\section{Introduction}
Discourse segmentation consists in breaking down texts
or conversations into functional units that better correspond to participants' intentions than sentences or simple speech activity chunks. We will use the term discourse unit (DU) \cite{Asher2003} to designate a minimal speech act or communicative unit. Each DU corresponds roughly to a clause-level content that denotes a single fact or event.

While the segmentation of discourse units (DUs) in written documents has received a lot of attention from the discourse and NLP community,  the same cannot be said for the segmentation of spontaneous speech. In this study, we approach the segmentation of discourse units in a corpus of spontaneous speech in Taiwan Southern Min.%


Southern Min is a Sino-Tibetan language spoken by over 50 million people, and includes Taiwan Southern Min, which is one of the official languages of Taiwan. We take advantage here of an existing discourse-segmented corpus of spoken interviews for running discourse segmentation experiments.

We develop DU segmenters based on different principles and evaluate their performance. More precisely, we compare fine-tuning an LLM with hand labeled data vs. employing a data programming approach \cite{Ratner2017} that requires only a fraction of annotated data. While fine-tuning LLMs for language well represented in the LLM training data proved to be a very efficient solution \cite{Gravellier2021,prevot-etal-2023-comparing}, it remains to be seen whether this approach is relevant for languages, particularly their spontaneous speech variants, less represented in the training data. Finally, we investigate the impact of using either romanization or Chinese characters in our dataset, as well as the potential contribution of prosody.

\section{Related Work}
\label{sec:soa}

In recent years, there has been a renewed interest in discourse parsing and discourse unit segmentation within the NLP community. As in other subdomains, Large Language Models have proven highly beneficial and have made it possible to reach unprecedented scores for these tasks. However, discourse segmentation within these deep learning approaches has been applied to only a few languages, until the recent initiative of {\sc disrpt} campaigns started \cite{Zeldes2019,Zeldes2021,braud-etal-2023-disrpt}.
The work conducted within the framework of these campaigns has equipped the community with a set of powerful tools and frameworks to perform DU segmentation using these contemporary approaches.

As discussed in \citet{braud-etal-2023-disrpt}, even for written genres, discourse segmentation performance drops in languages other than English and when gold sentences are not given, due to sentence segmenters being far from perfect \cite{Braud2017}. Considering spontaneous conversational speech, the related tasks of dialogue-act segmentation and tagging yield various interpretations regarding the definition of base units. For instance, some models explain that, dialogue acts being multi-functional, several segmentations can be considered depending on the aspects of dialogue being considered at the time of segmentation \cite{petukhova2011multi}.\\

A recent trend involves approaching discourse segmentation with sequential models over contextual embeddings \cite{wang-etal-2018-toward,Muller2019}. Turning specifically to spontaneous speech discourse segmentation, \citet{Gravellier2021} applied a weak-supervision approach \cite{Ratner2017} and reached an f-score of $73.7$ while having access to gold turn segmentation. More specifically, manual heuristic rules, including some rules exploiting the discourse segmentation model trained on a written dataset \cite{Muller2019}, were created to noisily annotate the entire dataset. This noisy data was then used to fine-tune an LLM, BERT \cite{Devlin2018} in that case. In \citet{prevot-etal-2023-comparing}, a larger amount of manual annotation made it possible to compare fine-tuning with larger amounts of training data and a weakly-supervised approach. For this French dataset, it was concluded that more than 7000 annotated DUs were required in the supervised training approach to beat the weakly-supervised approach (f-score: $70.6$). When more data was used, supervised fine-tuning reached slightly higher scores (f-score: $73.9$). These f-score results are $10$--$15$\% lower than the scores obtained on written genres, which is expected as sentence splitters leveraging punctuation provide substantial assistance for discourse unit segmentation. In speech, particularly spontaneous interactional speech, pauses are useful but are by far less reliable in predicting discourse units since they are involved in many other dimensions and are subject to significant inter-individual variability. Recently, \citet{metheniti-etal-2023-discut}\footnote{Code at \url{https://github.com/phimit/jiant/}} developed an improvement over \citet{Muller2019}, allowing them to reach new state-of-the-art results for discourse segmentation in various languages. Our paper reuses the technical framework of this paper.\\

Segmenting speech into Discourse and Prosodic units has been the focus of numerous studies across various languages, including high-resource languages like English \cite{hirschberg1992intonational,hirschberg1996prosodic}, Dutch \cite{swerts1997prosodic}, French or Mandarin \cite{degand2009identifying,Prevot2015} as well as low-resource languages \cite{mettouchi2021prosodic}. Discourse-prosodic interface research has also been developed for better understanding turn-taking mechanisms \cite{hu2023conversational,botinis2007mutlifactor}. The deep connection between discourse and prosody has led researchers to explore prosodic cues for discourse tasks with some success \cite{pierrehumbert1990meaning,shriberg2000prosody}. However, to our knowledge, there are no studies in which modern LLM-based systems described above, which achieve high scores based solely on transcripts, have benefited from incorporating acoustic-prosodic cues. An interesting attempt was made in \citet{Gravellier2021}, which validated the weak-supervision approach exploiting silent pauses among other elements, but the results did not improve with the inclusion of other acoustic-prosodic features. This is likely due to (i) the already high scores obtained from text alone, which would require cues coming from other sources to yield very high precision; and (ii) to the challenge of automatically extracting reliable prosodic cues, such as speech rate, pitch or even intensity, from conversational speech.\\


Discourse Studies on Southern Min (and related languages such as Hakka or Cantonese) have focused on final particles \cite{lien1988taiwanese,li1999utterance,fung2000final,chappell2019southern}, which can carry an interesting range of semantic and pragmatic functions. Moreover, there have been specific corpus studies examining discourse markers in Taiwan Southern Min \cite{chang2002discourse, chang2008discourse, chang2017corpus}. However, to the best of our knowledge, there has been no attempt to automatically segment discourse units in this language.

% NOTE: the following sentence duplicated (in truncated form) the end of the previous paragraph; commented out.
%Additionally, there have been specific corpus studies examining \cite{chang2002discourse, chang2008discourse, chang2017corpus}. However, to the best of our knowledge, there has been no attempt to automatically segment discourse units in this language.

\section{Dataset}%
\label{sec:data}
\subsection{Base data}
The discourse segmentation data used in this paper comes from an 8-hour corpus of monologue-like spontaneous speech elicited in sociolinguistic interviews as part of a larger project that collected Min-Mandarin bilingual speech recordings all over Taiwan between 2004 and 2010 \cite{wang2013taiwan,fon2004corpus}. This subset of the corpus, also used in phonetic studies on phenomena including pre-boundary lengthening \cite{wang2023tonesandhi, wang2022interaction, wang2012durational} and tone sandhi \cite{chen2018tone}, contained speech materials from 16 speakers, who each contributed around 30 minutes of recording. The speakers were evenly split in gender and two age groups (old and young). At the time of recording, the old speakers were between 50 and 65 years old, and the young speakers were between 20 and 35 years old. Due to the original recording setup, the transcripts only focused on speech from the interviewee, with the interviewer's turns being labeled with a `turn' token. The transcripts follow the convention used in a dictionary\footnote{\url{https://sutian.moe.edu.tw/zh-hant/}} administered by the Ministry of Education in Taiwan, along with a romanized version. The transcripts were aligned with the recordings at the syllable level using EasyAlign \cite{goldman2011easyalign} with manual corrections from a trained phonetician. During the manual correction process, pause annotations were incorporated into the transcripts that are used in this study. In addition to pauses, the corpus also contains annotations on prosodic breaks, with a main goal of identifying the presence of two levels of breaks (intonational phrases and intermediate phrases), as well as breaks resulting from hesitations and disfluencies. Data from two of the speakers were used to calculate cross-labeller agreement (kappa: $0.86$).
We observe that, although done completely independently, discourse and prosodic annotations exhibit a relationship: 45\% of the prosodic breaks are also discourse breaks, while 82\% of the discourse breaks also correspond to a prosodic break. \\  %(Precision: 85.6, Recall: 91.7, f-score: 88.6; kappa: 86.0)

Due to the lack of widely available text-processing tools in this language, a dictionary-based method was used to perform word segmentation (maximal length matching) and POS tagging, the latter of which follows a multihot format, i.e., a word that is ambiguous between multiple POS tags according to the dictionary is annotated as `1' for all those tags. \\

The corpus contains 88.5K word-level tokens, including pause (\#) and dedicated interviewer-turn symbols.

%\review{I suggest authors providing suitable definition and reference for romanized and character transcripts.}

%\review{It is not clear to me if the dataset used by authors is available. Please, share the github link (in case of available). Otherwise, explicitly inform about copyrights.}

%Size of Raw data (tokens, duration)\\

%Specificities : Spontaneous Speech, monologic,...\\

%Examples with transcripts and translation\\

%Discuss romanized vs. characters \\
%annotate

\subsection{Discourse Segmentation Annotation}
The corpus contains annotation of discourse units, which are defined as units that contain a verb and its core arguments, a criterion that is also used in other studies on the interaction between discourse and prosody (e.g., \citealp{chen2019prosodic, Prevot2015}). Crucially, discourse annotation in this corpus was performed independently from the recordings, i.e., the annotators only saw the transcripts, with turn information but no precise timing information, when they performed the task. Similarly to the prosodic labeling, two annotators labeled transcripts from two of the speakers for examination of interlabeller agreement (kappa: $0.96$), and one annotator labeled the remaining transcripts. See Table~\ref{du_examples} for examples of discourse units.

%\review{According to authors, the corpus was annotated using discourse units, which they define as units that contain a verb and its core arguments. However, the authors did not present their motivations for that or any literature work that followed this definition. Finally, I suggest authors provide, besides references, clear examples of this annotation.}

%\review{The corpus was annotated by two annotators and the authors report a high kappa 96.2. The authors did not present any information about the annotators. Are they linguists? What kind of expertises are they? Could authors discuss the hypothesis for high obtained kappa? I also suggest the authors present the Kappa matrix. See example in https://aclanthology.org/2023.ranlp-1.127.pdf. Finally, it is not clear how prosodic was annotated.}

\begin{CJK*}{UTF8}{bsmi}
\begin{table*}[htb!] 

	\begin{tabular}{cccc}
		\hline
		char:	& [其實\hspace{0.2cm}\#\hspace{0.2cm}我\hspace{0.2cm}相信]  & [別人\hspace{0.2cm}會使] \# & [咱\hspace{0.2cm}就\hspace{0.2cm}一定\hspace{0.2cm}會使] \\
		roman:	& [ki5-sit8 \# goa2 siong-sin3] & [pat-lang5 e7-sai2] & [lan2 to it-teng7 e7-sai2] \\
		gloss: & [actually (pause) I believe] & [others can] & [we PART must can] \\
		trans: & [`actually I believe'] & [`(if) others can (do it)'] & [`we must be able to (do it as well)'] \\
\hline
	\end{tabular}
 	\caption{Examples of three discourse units. Note how a pause (\#) may occur within a discourse unit.}
    \label{du_examples}
\end{table*}
\end{CJK*}

Disfluencies were not segmented separately and were instead included within discourse units. Discourse labellers had access to gold turn segmentation but were not told to use it systematically. As a result, a few manually labeled discourse units span more than one turn.\\ %discourse unit. \\

\begin{figure}
\centering
\includegraphics[width=0.45\textwidth]{pics/du_sizes.png}%
\caption{DU lengths in tokens.}
\label{fig:du_lengths}
\end{figure}

Taking a more quantitative perspective, the distribution of the annotated discourse unit lengths in terms of tokens is provided in Figure~\ref{fig:du_lengths}. We can see a fairly balanced distribution of lengths that are shorter than 10 tokens with a mean of $7.5$ tokens per discourse unit. Truly conversational corpora tend to present a different bimodal distribution with a mode of very short units (made of 1 token) corresponding to feedback and back-channels and a second mode of units made of 4-6 tokens. The dataset here is a corpus of interviews for which only the interviewee is transcribed. While being truly spontaneous, this explains why there are fewer extremely short interactional units as well as why the mode of the distribution includes longer lengths than purely dialogic genres.

\section{Methodology / Experiments}
\label{sec:method}

The corpus includes interviews of 16 speakers. We made 8 folds composed of two speakers each and ran a cross-validation over the 8 folds with different test / dev / train splits. Given our corpus, this is a method that maximizes the distance between training and testing data.\\ 
%In the weak supervision experiment one fold was kept for {\tt dev} and one for {\tt test} while the 6 others were for the training. In the supervised experiment, we simply.

%The methodology used in this paper follows a series of paper introduced in \ref{sec:soa}. 
Two main approaches are evaluated for automatically segmenting our dataset: (i) directly fine-tuning an LLM with all the data at our disposal (in a supervised way) ({\it Supervised} setting); (ii) creating a noisily annotated dataset thanks to manual heuristic rules (See Figure \ref{fig:lf}) and a model to combine them.\\

More specifically, we used {\sc Roberta} \cite{liu2019roberta}, and the framework used to fine-tune it was {\sc Discut} \cite{metheniti-etal-2023-discut}, grounded in the {\sc jiant} environment \cite{pruksachatkun-etal-2020-jiant}. \\

The weak-supervision framework uses {\sc skweak} \cite{Lison2021} rather than {\sc Snorkel} \cite{Ratner2017}. {\sc skweak} natively allows the model to exploit the sequential nature of our task. On the technical side, {\sc skweak} relies on {\sc Spacy} \cite{Honnibal2017} documents. In order to keep all the relevant information (timing, pos-tags, prosody labels) linked to the tokens and to use them in the labeling rules, we made use of {\sc spacy} extension attributes. %Compared to {\sc snorkel} framework {\sc skweak} data model grounded on {\sc spacy} provides a naturalistic framework for dealing with documents as token sequences.

\begin{figure*}[htb!]
\begin{verbatim}
def pause_and_begin_char(doc):
    for idx, token in enumerate(doc):
        if idx > 0:
            if ((doc[idx-1].text == '#') and (doc[idx-1]._.dur > PAUSE)
                    and (doc[idx].text in BEGIN_CHAR)):
                yield idx,idx+1,'BDU'
            else:
                yield idx,idx+1,'ABS'
        else:
            yield idx,idx+1,'BDU'
\end{verbatim}
\caption{Labelling Function example (pause combined with a DU-initiating character)}
\label{fig:lf}
\end{figure*}

In the weakly supervised approach, we use {\sc skweak}'s ability to build a generative model from noisy labels provided by the labeling rules. {\sc skweak} allows choosing an HMM to perform this sequence labeling task. While this approach can be adopted without annotated data, a small development set is useful for testing and crafting the heuristic labeling rules. We can decide more efficiently which manual rules should be retained, dropped or improved thanks to the metrics that are computed on the development set. Besides precision, recall and f-score, {\it overlaps} and {\it conflicts} (with other rules) metrics are also useful for making decisions about the usage of these rules (See Table \ref{tab:lr_profiles}).\\


\begin{table}
\begin{tabular}{|c|c|c|c|c|}
\hline
name & label & conflict &	precision &	recall \\ \hline
\#\_begpos &	BDU &	0.14 &	0.86 &	0.19 \\ \hline
turn              &	BDU & 0.16 &	0.84 & 0.11 \\ \hline
beg\_char 	  & BDU & 0.25 &	0.75 &	0.21 \\ \hline
conj 	  & BDU &	0.36& 	0.64 &	0.24 \\ \hline
\end{tabular}
\caption{Profiles for a few labeling rules}
\label{tab:lr_profiles}
\end{table}

To summarize, the weakly supervised approach is performed  as follows:
\begin{enumerate}
\item write the labeling rules (See Figure \ref{fig:lf}) ;
\item apply and evaluate them on the {\tt dev set} (iterate with the previous step until satisfied with labeling rules profiles on {\tt dev set}) (See profiles in Table \ref{tab:lr_profiles});
\item apply the labeling rules to the {\tt train set}; 
\item fit the HMM {\sc skweak} (rules aggregation) model;
\item apply the resulting model to the {\tt test set}.
\end{enumerate}

For the time being, the labeling rules crafted are extremely simple. They use (i) pause duration and turn information; (ii) frequent tokens present at discourse boundaries; (iii) POS-tags over-represented at discourse boundaries. Moreover, manually annotated prosodic unit boundaries are included in the dataset and we use them for some experiments. As mentioned above, POS-tags are encoded in a multihot format. The labeling rules exploiting POS are formulated according to this ambiguous situation.

\paragraph{Characters vs. letters}
The corpus we are working with includes two versions of the transcription: characters and romanization (as seen in Table~\ref{du_examples}). All our experiments were carried out on both written forms.

\paragraph{Prosodic boundaries}
This corpus comes with expert manual annotations of prosodic breaks. For the gold dataset, we created two versions of the dataset: one without any kind of prosodic information; and one with a special token corresponding to the presence / absence of a prosodic break. This special token was added to the transcript in all datasets (train / test / dev).\\

% -  (I am not sure whether we should add the syllables... I should do have at least to try to see how it changes the task and the pipeline)\\
\section{Results}
\label{sec:results}

\begin{figure}
\centering
\subcaptionbox{Precision}{
\includegraphics[width=0.45\textwidth]{pics/results_prec.png}}\\%
\subcaptionbox{Recall}{\includegraphics[width=0.45\textwidth]{pics/results_rec.png}}\\%
\subcaptionbox{F-score}{\includegraphics[width=0.45\textwidth]{pics/results_fscore.png}}%}%
\caption{Supervised vs. Weakly-supervised. {\it blue: 200ms pause baseline; orange: romanized; green: characters. From left to right, \_1: 1\% training data ($\sim$700 toks), \_10: $\sim$7K toks, \_100: $\sim$70K toks}}
\label{fig:results}
\end{figure}

The results comparing the general approaches are presented in Figure \ref{fig:results}; the ones related to the impact of the written form used are in Figures \ref{fig:results_units} and \ref{fig:results_pc}; and the results of the prosody experiments are visualized in Figure \ref{fig:results_pros}. All the numbers can be checked in Appendix \ref{tab:results}.


\paragraph{Supervision or weak-supervision}
Our results\footnote{Throughout the paper, the significance labels included in the figures correspond to {\it p-values} of a {\it t-test} performed on the folds of the experiment. A difference between two conditions is said to be significant (*/**/***) if t-testing the two series of values coming from the folds for both conditions yielded the corresponding threshold p-values (0.05 / 0.01 / 0.001).} (presented in Figure \ref{fig:results}) show that our weak-supervision approach remains behind the supervised approach. This is true with a large amount of manually annotated training data ($\sim$70K tokens)\footnote{For characters, the supervised approach gives an f-score of $78.7$ (p:$77.0$/r:$80.5$) while weak supervision only reaches a $52.0$ f-score (p:$55.7$/r:$50.4$).} but the difference is already significant with smaller amounts of training data ($\sim$7K tokens) for precision, recall and f-score (P:$70.8$/R:$63.0$/F:$66.7$). Weak supervision does better only if an extremely limited amount of training data is available ($\sim$700 tokens).
%Also the weak supervision does only slightly better that the 200ms pause baseline.

\begin{figure}
\centering
\subcaptionbox{Precision}{
\includegraphics[width=0.45\textwidth]{pics/results_unit_prec.png}}%
\\
\subcaptionbox{Recall}{\includegraphics[width=0.45\textwidth]{pics/results_unit_rec.png}}%
\\\
\subcaptionbox{F-score}{\includegraphics[width=0.45\textwidth]{pics/results_unit_fscore.png}}%}%
\caption{Characters vs. Romanized. {\it blue: 200ms pause baseline; orange: romanized; green: characters. From left to right, \_1: 1\% training data ($\sim$700 toks), \_100: $\sim$70K toks}}
\label{fig:results_units}
\end{figure}


\paragraph{Which base units?}
\begin{CJK*}{UTF8}{gbsn}
The results of the experiments show that different written forms (characters vs. romanized) for the corpus yielded significantly different results. The difference between the two versions of the corpus lies in the fact that some romanized tokens correspond to several characters (e.g., `ah' corresponds to `啊', an utterance-initial/final particle, and `矣', a sentence-final particle and perfective aspect marker; `e5' corresponds to `的', a possessive marker and sentence-final particle, `个', a classifier, and `鞋', a noun for `shoe'), while there are also some, but much fewer, characters that correspond to different romanizations (e.g., `嘛' corresponds to `ma7', which means `also', and `mah', a final particle). This situation led us to propose several hypotheses. First of all, when there is not a lot of fine-tuning data, having fewer symbol types can help to obtain a robust model faster. When more annotated data is available, having more specific symbols should bring better results by resolving some ambiguities. However, a second fact to consider is that the LLM we are fine-tuning ({\sc Roberta}) includes Mandarin Chinese but not Southern Min. We therefore hypothesized that the character version should have an advantage when a very small amount of data is provided, since the base symbols are present in the model to fine-tune while the romanized symbols featuring tone digits should be something completely new for the model.\\
\end{CJK*}

\begin{figure*}
\centering
\subcaptionbox{Precision}{
\includegraphics[width=0.33\textwidth]{pics/results_pcline_prec.png}}%
\hfill
\subcaptionbox{Recall}{\includegraphics[width=0.33\textwidth]{pics/results_pcline_rec.png}}%
\hfill
\subcaptionbox{F-score}{\includegraphics[width=0.33\textwidth]{pics/results_pcline_fscore.png}}%}%
\caption{Amount of training data. {\it orange: romanized corpus; green: character version. From 1\% training data ($\sim$700 toks) to 100\% ($\sim$70K toks). Dotted lines, blue: baseline, green and orange: weak supervision}
}
\label{fig:results_pc}
\end{figure*}


The results presented in Figure \ref{fig:results_units} show an advantage for the character-based corpus with a large amount of fine-tuning data (Characters: $77.0/80.5/78.7$; Romanized: $72.5/75.1/73.6$). It seems to be also the case when a small amount of data is provided, but this difference did not reach statistical significance. There also seem to be some complexities where we could expect to find a sweet spot for the romanized version (a little data for fine-tuning but not a lot, see the precision and recall with 5\% and 10\% of training data in Figure \ref{fig:results_pc}), but the numbers do not allow us to conclude on this result.
% Still ongoing : check the differences + maybe try to have an idea of significance
%This is actually the pattern we observe in the data. When extremely low amount of fine-tuning data is provided (1\% \ 750 tokens \ 100 DUs) what we obsver




\begin{figure}
\centering
\includegraphics[width=0.45\textwidth]{pics/results_pros_fscore.png}
\caption{Adding prosody. F-score}
\label{fig:results_pros}
\end{figure}

\paragraph{Potential help from prosody}
Prosody information used in this study had been manually added. As explained above, this prosodic annotation is, however, completely independent from the discourse segmentation. From a linguistic perspective, prosody should help in segmenting discourse units in speech since segmentation is one of the linguistic functions of prosody \cite{swerts1997prosodic,hirschberg1992intonational,degand2009identifying,di2013prosodie}.
However, the recent work of \citet{Gravellier2021}, carried out in a framework similar to ours, did not show a benefit of adding prosodic-acoustic cues for performing discourse segmentation. This was, however, based on automatic acoustic extraction. Given the data available to us, we decided to test whether ``gold'' prosodic segmentation would help discourse segmentation performance. More precisely, every token in our dataset carries the information of whether it is at the beginning of a prosodic unit or not.% We can therefore easily add prosody based labeling rules for producing the noisily annotated dataset by simply checking this prosodic information and suggesting discourse boundary where a prosodic boundary occurs.\\




%This information can be used only as a latent information at decoding time if the prosodic information is not included present in the test set and / or not included in the fine-tuned language model. 
The base model we used did not allow for an enrichment at the token level. We therefore translated the prosodic information into a token. More precisely, for each start of a labeled prosodic unit we inserted a rare character in the transcript. Figure~\ref{fig:results_pros} illustrates the statistically significant benefit of adding prosodic information for the characters and romanized versions of the corpus. The increases for the character version were $+4.5$, $+2.5$ and $+3.5$ for precision, recall and f-score respectively. These increases might seem modest but one should remember that pause duration and turn information was already taken into account before exploiting these prosodic labels.

\begin{figure}
\centering
\subcaptionbox{DU/PU-initial `ah'}{
\includegraphics[width=0.5\textwidth]{pics/high_ah_bi3.png}}
\\
\subcaptionbox{DU/PU-final `ah'}{\includegraphics[width=0.5\textwidth]{pics/low_ah_bi4.png}}%
\caption{Illustration of prosodic help to discourse unit segmentation: (a) The particle `ah' being used as a DU-initial marker coincides with an intermediate phrase break (BI-3) signaled by pitch reset, i.e., higher f0 at `ah'. (b) The particle is DU-final and exhibits lengthening and continued f0 declination with the preceding syllable, both of which are characteristics of an intonational phrase boundary (BI-4).}
\label{fig:ex_prosody}
\end{figure}

%\review{The figures with boxplots and charts are hard to read. I would recommend making all these figures single column (like Figure 6) and placing the subfigures (e.g., (a), (b), (c)) one on top of each other, which will allow them to be a bit larger. Figure 7 is almost illegible. The use of color in the boxplots is problematic for accessibility reasons: it's ok to use color but please give additional visual clues to distinguish things for those who can't see color or print the paper in black \& white.}

\section{Error Analysis}

To further understand how our models could be improved, we performed a detailed qualitative error analysis of the outputs of the various models.
%non-standard units (disfluencies, units without a main verb, abandoned units)

(\ref{ex_boyfriend}) is an example where the models trained on gold and WS data show the same segmentation error: while the gold annotation does not segment this sequence into two DUs, the models put a boundary after the sentence-final particle `oh' and a pause. It is a representative example of the overuse of pauses as a segmentation cue, especially for the WS-trained model. It also shows that the human annotator has a stronger tendency to only segment DUs with a main verb (thus `reversely my only friend oh' is not a DU) while also neglecting potential disfluencies and false starts (`reversely is'). It is worth noting that while the literal word sequence contains `reversely is', the whole phrase has the same interpretation as `reversely'. The presence of complex adverbs and/or discourse markers is likely another reason that this task is challenging for the models. %I may change this example because the `reversely is' could be dinged as a segmentation problem? (and it kind of is... it's some grammaticalized used of the IS verb...

\begin{pexample} `On the other hand, my boyfriend oh he would still go to see me' (GEN: genitive marker; PART: a marker similar to ba5 in Mandarin ba construction.)  \label{ex_boyfriend}
    \a Gold annotation: [ah reversely is \# reversely is I GEN boy friend oh \# he still would go PART me see]
    \a Gold \& WS-trained: [ah reversely is \# reversely is I GEN boy friend oh \#] [he still would go PART me see]
\end{pexample}

(\ref{ex_boyfriend_short}) is another example where the gold-trained model oversegmented a DU that was viewed by the human annotator as a noun and a relative clause (`The boyfriends that I had'). 

\begin{pexample} `The boyfriends that I had I always didn't marry them'   \label{ex_boyfriend_short}
    \a Gold annotation (and WS-trained): [I self have GEN boy friend all all marry no success]
    \a Gold trained:  [I self have GEN boy friend] [all all marry no success]
\end{pexample}


Finally, (\ref{ex_walking}) shows an example of how gold-trained and WS-trained segmentation may differ from the gold annotation in distinct ways. The gold annotation has a DU boundary between the main clause and the tag question, the former containing some disfluencies. The model trained on gold annotation did not recognize the boundary with the tag question and instead put a boundary before the word `like this' (an2-ne), which reflects the fact that an2-ne is a discourse marker that can occur in clause-initial and clause-final positions. The model trained on WS data, on the other hand, did not put a DU boundary for the entire sequence (thus having an error of under-segmentation before `you know not'), as there was no pause nor words that have a strong tendency to start a DU in the corpus.

\begin{pexample} `At that time, walking still didn't require tiptoeing, you know?' (hyphen-connected units denote a word in TSM). \label{ex_walking}
    \a Gold annotation: [Then walking still does-not like this does-not require tiptoeing] [you know not]
    \a Gold-trained: [Then walking still does-not] [like-this does-not require tiptoeing you know not]
    \a WS-trained: [Then walking still does-not like-this does-not require tiptoeing you know not]
\end{pexample}

\section{Discussion and Future Work}
%Future : 
%Syllables, apply the model to the larger not annotated dataset
%Refine labelling rules
%****TODO : discuss positively the usability of these models for creating large dataset annotated at discourse level 
In this paper, we applied state-of-the-art techniques of discourse segmentation to a dataset of Taiwan Southern Min. We compared supervised and weakly supervised approaches. Moreover, the linguistic information included in the original dataset allowed us to test some hypotheses along the way. We tested whether (i) it was easier to segment with the character-based or romanized version of the corpus; and (ii) prosodic gold labels could help these new models of discourse segmentation.



An important overall result is that the approach employed (fine-tuning a sequence-to-sequence model) performs extremely well on this Taiwan Southern Min corpus, a language not included in the base large language model (LLM) used. This is an important result with regard to the applicability of such approaches to low-resource languages for this task. The longer term goal of this work is to apply the best model we can build to a much larger corpus of Taiwanese interviews. The results obtained enable us to try to replicate existing studies on the discourse--prosody interface in spontaneous speech, which have relied solely on manually annotated data. %Furthermore, discourse processing of the whole corpus of interviews opens up research questions in terms of computational analysis of the narratives featured in this corpus.\\
% Furthermore, processing the entire corpus of interviews presents a plethora of research questions in terms of computational analysis of the narratives contained within.

Getting into the comparison of the two approaches tested, we should note here that the scores obtained with gold annotations should be taken as a top line for the weak supervision approach. Indeed, the amount of manual gold segmentation for this corpus is substantial and does not align with the typical scenario for adopting a weak-supervision approach. With this consideration in mind, we observe that the weakly supervised approach failed to produce comparable results to the supervised setting. This can be attributed on the one hand to the supervised approach yielding highly competitive results through fine-tuning with only about 10\% of our full amount of annotated data (corresponding to $7$K tokens and $700$ discourse units); and on the other hand to the relatively low performance of our weakly supervised model. However, this does not negate the potential interest of weak supervision. Our current rules are rudimentary, primarily using simple pauses, token information and ambiguous POS-tags. We intend to enhance these labeling rules in several directions: (i) using a real POS-tagger that would reduce ambiguity; and (ii) developing more sophisticated labeling rules to address phenomena specific to spontaneous speech, such as disfluencies.

Regarding the comparison between the character-based and romanized versions of the corpus, the clear conclusion is that the character version consistently yields better results regardless of the amount of fine-tuning data provided. This could be attributed both to the lower ambiguity of characters compared to the romanized version and to the presence of Mandarin data in {\sc Roberta}.

Regarding prosody, this study has shown that, in line with linguistic predictions and previous computational models, but contrary to recent findings on this task, prosodic information can indeed help in discourse unit segmentation. The next obvious step is to automate the extraction of relevant acoustic features that efficiently approximate the manual annotations we had in this study. From the primary prosodic features identified by \citet{shriberg2000prosody} for English, excluding the ones already exploited by our pause- and turn-related rules, we identify (i) pitch differences across the discourse unit boundary, and (ii) duration of phones and rhymes preceding the decision point.

\section*{Acknowledgments}
The authors would like to thank Philippe Muller and Pierre Magistry for side discussions related to the content of the paper. This study was supported by the Institute of Linguistics, Academia Sinica (LING-113-018-01), Taiwan's National Science and Technology Council (NSTC-112-2410-H-001-098-MY2) as well as by the Institut Convergence ILCB (ANR-16-CONV-0002) and the ANR-Funded project SUMM-RE (ANR-20-CE23-0017).

%\newpage
% Entries for the entire Anthology, followed by custom entries
\bibliography{anthology,custom}

\newpage

\onecolumn

\appendix

\section{Appendix}
\label{sec:appendix}


\subsection{Global Results}

\begin{figure*}[hbt]
\centering
\subcaptionbox{Precision}{\includegraphics[width=0.45\textwidth]{pics/results_prec_glob.png}}%
\hfill
\subcaptionbox{Recall}{\includegraphics[width=0.45\textwidth]{pics/results_rec_glob.png}}%
\subcaptionbox{F-score}{\includegraphics[width=0.45\textwidth]{pics/results_fscore_glob.png}}%}%
\caption{Global Results (\textit{blue: baseline; orange: romanized corpus; green: character version})}
\label{fig:results_global}
\end{figure*}

\begin{table*}[hbt]
\begin{tabular}{|l|r|r|r|r|r|r|}
\hline
& prec mean & prec std & rec mean & rec std & fscore mean & fscore std \\ \hline
pause baseline (200ms) & 0.486618 & 0.060169 & 0.529578 & 0.068400 & 0.504385 & 0.050271 \\ \hline
super. rom (700 toks) & 0.616545 & 0.061804 & 0.344490 & 0.081643 & 0.435640 & 0.067387 \\ \hline
super. char (700 toks) & 0.652257 & 0.063328 & 0.398917 & 0.065834 & 0.490842 & 0.053958 \\ \hline
weakly super. rom & 0.601497 & 0.031159 & 0.524181 & 0.077477 & 0.557128 & 0.047371 \\ \hline
weakly super. char & 0.556877 & 0.064321 & 0.503797 & 0.098992 & 0.519769 & 0.055981 \\ \hline
super. rom (7K) & 0.654762 & 0.054972 & 0.601636 & 0.058013 & 0.624031 & 0.036572 \\ \hline
super. char (7K) & 0.707989 & 0.049716 & 0.629861 & 0.049157 & 0.666265 & 0.046354 \\ \hline
super. rom (70K)& 0.724710 & 0.040760 & 0.750888 & 0.052945 & 0.735763 & 0.028225 \\ \hline
super. char (70K) & 0.770644 & 0.020731 & 0.804883 & 0.036518 & 0.787142 & 0.025453 \\ \hline
super. rom (70K) + pros & 0.757477 & 0.027094 & 0.792695 & 0.034534 & 0.774099 & 0.020699 \\ \hline
super. char (70K) + pros & 0.814579 & 0.031807 & 0.829729 & 0.029347 & 0.821556 & 0.020996 \\ \hline
\end{tabular}
\caption{Global Results}
\label{tab:results}
\end{table*}


\newpage
\subsection{Tokens and POS lists used in the labelling rules}
\subsubsection{POS list}
\begin{verbatim}
BEGIN_POS = ['interjection']
END_POS = ['interjection', 'onomatopoeia', 'particle']
NON_BEGIN_POS = ['interrogative', 'locative', 'numeral', 'onomatopoeia', 'quantifier']
NON_END_POS = ['adposition', 'conjunction', 'numeral', 'pronoun']
\end{verbatim}

\subsubsection{Romanized token lists}
\begin{verbatim}
BEGIN_UNI_ROM = ['tan7-si7', 'li5-chhiann2', 'sou2-i2', 'henn', 'ran2m-houm']
END_UNI_ROM = ['lah', 'bo', 'mah', 'neh', 'nia5', 'm']
BEGIN_BI_ROM = ['ah chit-ma2',  'ah na7', 'henn ah', 'li2 e7', 'ah i', 'in-ui7 li2',
                'sou2-i2 gun2', 'ah ma7', 'sou2-i2 goa2', 'ah cho3', 'tan7-si7 goa2',
                'ah si7','ah m7-koh','henn goa2','oh he','ah hit-chun7','ah chiah',
                'tioh8 bo']
END_BI_ROM = ['bo5 lah', 'ni5 ah', 'u7 ah', 'e5 lah', 'ho2 chiah8', 'bo5 ah','ah lah',
              'tioh8 ah', 'si5-chun7 honn', 'lah honn', 'henn ah', 'an2-ne lah',
              'goa2 kam2-kak', 'khi3 ah', 'kam2-kak kong2', 'an2-ne nia5', 'e5 an2-ne',
              'koe3 ah', 'tioh8 lah', 'ho2 ah', 'e5 oh', 'chai-iann2 kong2', 'e5 neh',
              'kang5-khoan2 ah', 'ho2 lah', 'an2-ne honn', 'tioh8 bo']
\end{verbatim}

\section{Labelling Rules}
\subsection{More examples}
\begin{verbatim}
def very_long_pause(doc):
    for idx, token in enumerate(doc):
        if idx > 0:
            if doc[idx-1].text in PAUSE_TOK and doc[idx-1]._.dur > VERY_LONG_PAUSE:
                yield idx,idx+1,'BDU'
            else:
                yield idx,idx+1,'ABS'
        else:
            yield idx,idx+1,'BDU' #beginning of doc
\end{verbatim}

\begin{verbatim}
def begin_pos(doc):
    for idx, token in enumerate(doc):
        if idx > 0:
            for cat in string_to_list(doc[idx]._.pos_list):
                if cat in BEGIN_POS:
                    yield idx,idx+1,'BDU'
            yield idx,idx+1,'ABS'
        else:
            yield idx,idx+1,'ABS'   
\end{verbatim}

\subsection{Labeling Function profiles (Romanized)}
\begin{table}[hbt]
\centering
\begin{tabular}{|l|l|l|r|r|r|r|}
\hline
 &            annotator & label &  conflict &  precision &  recall &     f1 \\ \hline
 1  &          non\_end\_pos &    NO &     0.028 &      0.991 &   0.252 &  0.401 \\  \hline
 2  &        non\_begin\_pos &    NO &     0.112 &      0.970 &   0.070 &  0.130 \\  \hline
 3  &      cluster\_rom\_neg &    NO &     1.000 &      0.700 &   0.001 &  0.002 \\  \hline
 5  &  pause\_ending\_bi\_rom &   BDU &     0.109 &      0.927 &   0.048 &  0.092 \\  \hline
 6  &      pause\_begin\_pos &   BDU &     0.112 &      0.888 &   0.082 &  0.151 \\  \hline
 7  &         begin\_bi\_rom &   BDU &     0.121 &      0.888 &   0.090 &  0.163 \\  \hline
 8  &   pause\_begin\_bi\_rom &   BDU &     0.121 &      0.879 &   0.048 &  0.091 \\  \hline
 9  &         pause\_endrom &   BDU &     0.200 &      0.875 &   0.033 &  0.064 \\  \hline
 10 &                 turn &   BDU &     0.158 &      0.842 &   0.111 &  0.196 \\  \hline
 11 &             beginrom &   BDU &     0.180 &      0.839 &   0.172 &  0.286 \\  \hline
 12 &        extreme\_pause &   BDU &     0.181 &      0.826 &   0.116 &  0.204 \\  \hline
 13 &       pause\_beginrom &   BDU &     0.181 &      0.819 &   0.064 &  0.119 \\  \hline
 14 &      cluster\_rom\_pos &   BDU &     0.200 &      0.800 &   0.008 &  0.015 \\  \hline
 15 &               endrom &   BDU &     0.318 &      0.773 &   0.016 &  0.032 \\  \hline
 16 &      very\_long\_pause &   BDU &     0.263 &      0.741 &   0.144 &  0.241 \\ \hline
 17 &           long\_pause &   BDU &     0.417 &      0.588 &   0.235 &  0.335 \\  \hline
 18 &        pause\_end\_pos &   BDU &     0.463 &      0.551 &   0.148 &  0.233 \\ \hline
 19 &        ending\_bi\_rom &   BDU &     0.490 &      0.530 &   0.101 &  0.170 \\ \hline
 20 &          conjunction &   BDU &     0.494 &      0.525 &   0.128 &  0.205 \\ \hline
 21 &                pause &   BDU &     0.490 &      0.514 &   0.336 &  0.406 \\ \hline
 22 &          short\_pause &   BDU &     0.583 &      0.424 &   0.520 &  0.467 \\ \hline
 23 &            begin\_pos &   BDU &     0.597 &      0.410 &   0.160 &  0.230 \\ \hline
 \end{tabular}
 \caption{Labeling Function profiles for the Romanized version}
 \end{table}
 
\subsubsection{Characters token lists}

\begin{CJK*}{UTF8}{gbsn}
BEGIN\_UNI\_ROM = ['但是', '然後', '而且', '所以', 'henn']\\
END\_UNI\_ROM = ['啦', '矣', '呢', '吧', '爾']\\
BEGIN\_BI\_ROM = ['啊 這馬',  '啊 若', 'henn 啊', '啊 伊', '因為 你', '所以 阮','所以 我', '啊 嘛', '啊 做',
 '但是 我','啊 是','啊 毋過','henn 我','喔 彼','啊 彼陣','著 無','啊 才']\\
END\_BI\_ROM =['無 啦',
 '年 矣',
 '有 啊',
 '的 啦',
 '矣 啦',
 '著 啊',
 '時陣  honn',
 '啦  honn',
 '著 矣',
 '有 無',
 'henn 啊',
 '袂 啦',
 '按呢 啦',
 '我 感覺',
 '去 矣',
 '感覺 講',
 '創 啥',
 '按呢 爾',
 '的 按呢',
 '好 啦',
 '著 啦',
 '共 講',
 '好 矣',
 '的 喔',
 '好 啊',
 '知影 講',
 '做 啥',
 '的 呢',
 '仝款 啊',
 '按呢  honn',
 '著  無 ']
\end{CJK*}

\end{document}