#!/usr/bin/bash

## This script processes an entire text file of Finnish
## with omorfi. Each sentence is passed word by word through
## omorfi, and whatever morphemes are returned by omorfi are
## output in place of the input token. The output is written
## to stdout and should be redirected to an output file
## when calling this script.

# Usage: <this script> [INPUT_FILE] > OUTPUT_FILE
#
# INPUT_FILE is optional; when omitted, the corpus path below is used.
readonly DEFAULT_INPUT=/data/parallel/training/europarl-v8.tokenized.fi-en.fi

#######################################
# Turn the analyser output for ONE sentence into one line of morphemes.
#
# NOTE(review): the format is assumed from how this script has always
# parsed it -- whitespace-separated columns (surface token first, analysis
# second), with the analyses of one token grouped together and groups
# separated by blank lines. Confirm against the installed omorfi version.
#
# For every group only the first line with at least two columns is used:
#   * analysis contains '#'  -> each non-empty '#'-separated piece is emitted
#   * analysis is not "+?"   -> the analysis is emitted as is
#   * analysis is "+?"       -> the surface token (column 1) is emitted
#
# Arguments: none
# Inputs:    analyser output on stdin
# Outputs:   one line on stdout; every emitted item is followed by a single
#            space (so the line ends in a space), then a newline. An empty
#            input still yields an empty line.
# Returns:   0
#######################################
extract_morphemes() {
  local -a fields pieces
  local analysis piece
  local awaiting_analysis=1

  # -r keeps backslashes literal; -a splits on whitespace without ever
  # globbing. The "|| (( ... ))" part also processes a final line that
  # lacks a trailing newline.
  while read -r -a fields || ((${#fields[@]} > 0)); do
    # A blank (or whitespace-only) line closes the current token group.
    if ((${#fields[@]} == 0)); then
      awaiting_analysis=1
      continue
    fi

    # Skip further analyses of a token already handled, and lines that
    # carry no analysis column.
    if ((!awaiting_analysis || ${#fields[@]} < 2)); then
      continue
    fi
    awaiting_analysis=0
    analysis=${fields[1]}

    if [[ $analysis == *'#'* ]]; then
      # Emit every non-empty piece. Empty pieces (leading or doubled '#')
      # are skipped rather than ending the split early.
      IFS='#' read -r -a pieces <<<"$analysis"
      for piece in "${pieces[@]}"; do
        if [[ -n $piece ]]; then
          printf '%s ' "$piece"
        fi
      done
    elif [[ $analysis != '+?' ]]; then
      printf '%s ' "$analysis"
    else
      # "+?" is treated as "no analysis": fall back to the surface token.
      printf '%s ' "${fields[0]}"
    fi
  done

  printf '\n'
}

#######################################
# Run every line of the input file through omorfi-analyse-text.sh and
# print one line of morphemes per input line.
#
# Exactly one output line is written per input line, even when the
# analyser fails, so the number of output lines matches the input.
#
# Globals:   DEFAULT_INPUT (read)
# Arguments: $1 - input file, one sentence per line (optional)
# Outputs:   morpheme lines on stdout, diagnostics on stderr
# Returns:   0 on success; 1 if the input is unreadable or the analyser
#            failed on at least one line; 127 if the analyser is missing
#######################################
main() {
  local input=${1:-$DEFAULT_INPUT}
  local -a tokens
  local analysis
  local line_no=0 failures=0

  if [[ ! -r $input || -d $input ]]; then
    printf 'error: cannot read input file: %s\n' "$input" >&2
    return 1
  fi
  if ! command -v omorfi-analyse-text.sh >/dev/null 2>&1; then
    printf 'error: omorfi-analyse-text.sh not found in PATH\n' >&2
    return 127
  fi

  while read -r -a tokens || ((${#tokens[@]} > 0)); do
    ((line_no += 1))

    # "${tokens[*]}" re-joins the tokens with single spaces: whitespace is
    # normalised, but no pathname expansion can happen on tokens such as
    # '*' or '?'. The analyser output is captured in a variable, so no
    # scratch files are created in the working directory.
    if ! analysis=$(printf '%s\n' "${tokens[*]}" | omorfi-analyse-text.sh); then
      printf 'warning: omorfi-analyse-text.sh failed on input line %d\n' \
        "$line_no" >&2
      ((failures += 1))
    fi

    extract_morphemes <<<"$analysis"
  done <"$input"

  ((failures == 0))
}

main "$@"
