#!/usr/bin/env python2.7
'''
Converts tagged data where there is a special "extender" 
tag to the BIO scheme. E.g., if ~ is the extender, then 
the tag sequence 
  PER ~ ~ ORG ~ O
will become 
  B-PER I-PER I-PER B-ORG I-ORG B-O

Note that all output tags will have the prefix B- or I-.

The input is of the form (token, tag1, tag2, other...), 
with tab-separated fields. Only 'tag1' and 'tag2' are 
transformed.

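Usage: EXTENDER [INPUT ...]

Lines are read from the INPUT files in order, or from standard
input if no INPUT is given. Empty lines separate sequences and
are copied to the output as empty lines.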
'''
from __future__ import print_function, division
import sys, os, re, codecs, fileinput

# Extender symbol, taken from the first command-line argument. An input tag
# equal to this string continues the previous tag instead of starting a new
# one. Raises IndexError at startup if the script is run without arguments.
EXT = sys.argv[1]

def transform(tag, ptag, ext=None):
    '''
    Convert a single raw tag to its BIO form.

    tag  -- raw input tag: either a label (e.g. 'PER') or the extender.
    ptag -- the already-converted tag of the preceding token in the same
            column (so it starts with 'B-' or 'I-'), or None when there
            is no preceding token in the current sequence.
    ext  -- the extender symbol; defaults to the module-level EXT.

    Returns 'I-' plus the label of ptag when tag is the extender, and
    'B-' plus tag otherwise.

    Raises ValueError if ptag is not a converted tag, or if tag is the
    extender but there is no preceding tag to extend.
    '''
    if ext is None:
        ext = EXT

    # ptag must be the output of an earlier call, never a raw input tag.
    # Explicit raises are used so the checks survive `python -O`.
    if ptag is not None and (ptag == ext or ptag[:2] not in ('B-', 'I-')):
        raise ValueError('previous tag %r is not a converted B-/I- tag' % (ptag,))

    if tag != ext:
        return 'B-' + tag

    # An extender at the very start of a sequence has nothing to continue;
    # report it clearly instead of failing while slicing None.
    if ptag is None:
        raise ValueError('extender %r has no preceding tag to extend' % (ext,))
    return 'I-' + ptag[2:]

# Stream the input one line at a time, writing each converted line as soon
# as it has been read.  `pparts` holds the already-converted fields of the
# previous line, or None at the start of the input and right after an empty
# line, so an extender is never resolved across a sequence boundary.
pparts = None
for ln in fileinput.input(sys.argv[2:]):
    # Strip only the line terminator.  Slicing off the last character
    # unconditionally would eat real data when the final line of the input
    # has no trailing newline.
    ln = ln.rstrip('\n')
    if not ln:
        # An empty line separates sequences: echo it and reset the context.
        print()
        pparts = None
        continue

    parts = ln.split('\t')
    if len(parts) < 3:
        # Fields 1 and 2 are the tag columns; name the offending line
        # rather than failing with a bare IndexError.
        raise ValueError(
            '%s:%d: expected at least 3 tab-separated fields, got %d'
            % (fileinput.filename(), fileinput.lineno(), len(parts)))

    # Only the two tag columns are rewritten; the token and any further
    # fields are passed through untouched.
    parts[1] = transform(parts[1], None if pparts is None else pparts[1])
    parts[2] = transform(parts[2], None if pparts is None else pparts[2])
    print(*parts, sep='\t')
    pparts = parts
