Syllable Parsing - Work in Progress

Mark Huckvale (mark@phonetics.ucl.ac.uk)
Tue, 03 Mar 1998 18:49:04 +0000

--=====================_888950944==_
Content-Type: text/plain; charset="us-ascii"

Attached are four components of the lexical parser. This
is the program that I am developing to take the SAMPA transcriptions
in the lexicon and expand them into syllable structure.

Part1: listing of vowel and consonant features for SAMPA transcription
Part2: list of legal onsets and offsets
Part3: XML DTD for a lexicon entry
Part4: partially completed lexical entries for two words

I have yet to:
- put features on <SYL>, <ONSET>, <RHYME>, <NUC> and <CODA> tags
based on the segments beneath them
- deal with syllabic /l/
- deal with linking /r/
- make agreements of CNSVOC etc within onset/coda

Nevertheless, comments on what we have so far are welcome!

Mark

--=====================_888950944==_
Content-Type: text/plain; charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment; filename="Example.txt"

------------------------------------------------------------------------
/* segment list -- SCRIBE broad class annotations */

/*
 * Attribute flags for segment_rec.attr.  Each is a distinct bit, so they
 * can be OR-ed together.  The notes below describe only how the flags are
 * used by segtab[] in this file.
 */
#define SIL 1		/* not set by any entry in segtab[] */
#define NUC 2		/* set on vowels and on syllabic consonants ("=l" etc.) */
#define SKIP 4		/* set on " ", "'", "," and "R" */
#define LONG 8		/* set, together with NUC, on long vowels and diphthongs */
#define STRESS1 16	/* set on "'" */
#define STRESS2 32	/* set on "," */
#define LINKR 64	/* set on "R" */

/*
 * One row of the segment table.
 *
 *   sym    transcription symbol (one or two characters)
 *   attr   OR of the flags above, or 0
 *   elem   markup template for the segment
 *   elem2  second markup template; "" for consonants.  For entries that
 *          supply only three initializers (the SKIP entries) this member
 *          is left as a null pointer, not "".
 *
 * NOTE(review): vowels carry two <VOC> templates, presumably one per
 * nucleus slot -- confirm against the code that consumes this table.
 */
struct segment_rec {
char *sym;
int attr;
char *elem;
char *elem2;
};

/*
 * Segment table, keyed by transcription symbol.  Entries are listed in
 * ascending strcmp() order of sym.
 *
 * This text was recovered from a quoted-printable attachment: "=3D" has
 * been decoded to "=" and soft line breaks have been rejoined.  Where a
 * soft break fell between two attributes the separating space was missing
 * from the archive and has been restored.
 */
static struct segment_rec segtab[]={
    { " ", SKIP, "" },
    { "'", SKIP|STRESS1, "" },
    { ",", SKIP|STRESS2, "" },
    { "3", NUC|LONG, "<VOC GRV=Y HEIGHT=MID RND=N>3</VOC>", "<VOC GRV=Y HEIGHT=MID RND=N>3</VOC>" },
    { "=l", NUC, "<CNS CNT=Y VOI=Y NAS=N STR=N SON=Y RHO=N CNSGRV=N CNSCMP=N VOCHEIGHT=CLOSE VOCRND=N>l</CNS>", "" },
    { "=m", NUC, "<CNS CNT=N VOI=Y NAS=Y STR=N SON=Y RHO=N CNSGRV=Y CNSCMP=N>m</CNS>", "" },
    { "=n", NUC, "<CNS CNT=N VOI=Y NAS=Y STR=N SON=Y RHO=N CNSGRV=N CNSCMP=N>n</CNS>", "" },
    { "=r", NUC, "<CNS CNT=Y VOI=Y NAS=N STR=N SON=Y RHO=Y CNSGRV=N CNSCMP=N VOCGRV=Y VOCHEIGHT=MID>r</CNS>", "" },
    { "@", NUC, "<VOC GRV=Y HEIGHT=MID RND=N>@</VOC>", "<VOC GRV=Y HEIGHT=MID RND=N>@</VOC>" },
    { "@U", NUC|LONG, "<VOC GRV=Y HEIGHT=MID>@</VOC>", "<VOC GRV=Y HEIGHT=CLOSE RND=Y>U</VOC>" },
    /* NOTE(review): elem says HEIGHT=OPEN but elem2 says HEIGHT=MID; every
       other long monophthong repeats the same template twice -- confirm. */
    { "A", NUC|LONG, "<VOC GRV=Y HEIGHT=OPEN RND=N>A</VOC>", "<VOC GRV=Y HEIGHT=MID RND=N>A</VOC>" },
    { "D", 0, "<CNS CNT=Y VOI=Y NAS=N STR=N SON=N RHO=N CNSGRV=N CNSCMP=N>D</CNS>", "" },
    { "I", NUC, "<VOC GRV=N HEIGHT=CLOSE RND=N>I</VOC>", "<VOC GRV=N HEIGHT=CLOSE RND=N>I</VOC>" },
    { "I@", NUC|LONG, "<VOC GRV=N HEIGHT=CLOSE RND=N>I</VOC>", "<VOC HEIGHT=MID RND=N>@</VOC>" },
    { "N", 0, "<CNS CNT=N VOI=Y NAS=Y STR=N SON=Y RHO=N CNSGRV=Y CNSCMP=Y>N</CNS>", "" },
    { "O", NUC|LONG, "<VOC GRV=Y HEIGHT=MID RND=Y>O</VOC>", "<VOC GRV=Y HEIGHT=MID RND=Y>O</VOC>" },
    { "OI", NUC|LONG, "<VOC GRV=Y HEIGHT=MID RND=Y>O</VOC>", "<VOC GRV=N HEIGHT=CLOSE RND=N>I</VOC>" },
    { "Q", NUC, "<VOC GRV=Y HEIGHT=MID RND=Y>Q</VOC>", "<VOC GRV=Y HEIGHT=MID RND=Y>Q</VOC>" },
    { "R", SKIP|LINKR, "" },
    { "S", 0, "<CNS CNT=Y VOI=N NAS=N STR=Y SON=N RHO=N CNSGRV=N CNSCMP=Y>S</CNS>", "" },
    { "T", 0, "<CNS CNT=Y VOI=N NAS=N STR=N SON=N RHO=N CNSGRV=N CNSCMP=N>T</CNS>", "" },
    { "U", NUC, "<VOC GRV=Y HEIGHT=CLOSE RND=Y>U</VOC>", "<VOC GRV=Y HEIGHT=CLOSE RND=Y>U</VOC>" },
    { "U@", NUC|LONG, "<VOC GRV=Y HEIGHT=CLOSE RND=Y>U</VOC>", "<VOC HEIGHT=MID RND=N>@</VOC>" },
    { "V", NUC, "<VOC GRV=Y HEIGHT=OPEN RND=N>V</VOC>", "<VOC GRV=Y HEIGHT=OPEN RND=N>V</VOC>" },
    { "Z", 0, "<CNS CNT=Y VOI=Y NAS=N STR=Y SON=N RHO=N CNSGRV=N CNSCMP=Y>Z</CNS>", "" },
    { "aI", NUC|LONG, "<VOC GRV=Y HEIGHT=OPEN RND=N>a</VOC>", "<VOC GRV=N HEIGHT=CLOSE RND=N>I</VOC>" },
    { "aU", NUC|LONG, "<VOC GRV=Y HEIGHT=OPEN RND=N>a</VOC>", "<VOC GRV=Y HEIGHT=CLOSE RND=Y>U</VOC>" },
    { "b", 0, "<CNS CNT=N VOI=Y NAS=N STR=N SON=N RHO=N CNSGRV=Y CNSCMP=N>b</CNS>", "" },
    { "d", 0, "<CNS CNT=N VOI=Y NAS=N STR=N SON=N RHO=N CNSGRV=N CNSCMP=N>d</CNS>", "" },
    { "dZ", 0, "<CNS CNT=N VOI=Y NAS=N STR=Y SON=N RHO=N>dZ</CNS>", "" },
    { "e", NUC, "<VOC GRV=N HEIGHT=MID RND=N>e</VOC>", "<VOC GRV=N HEIGHT=MID RND=N>e</VOC>" },
    { "e@", NUC|LONG, "<VOC GRV=N HEIGHT=MID RND=N>e</VOC>", "<VOC HEIGHT=MID RND=N>@</VOC>" },
    { "eI", NUC|LONG, "<VOC GRV=N HEIGHT=MID RND=N>e</VOC>", "<VOC GRV=N HEIGHT=CLOSE RND=N>I</VOC>" },
    { "f", 0, "<CNS CNT=Y VOI=N NAS=N STR=Y SON=N RHO=N CNSGRV=Y CNSCMP=N>f</CNS>", "" },
    { "g", 0, "<CNS CNT=N VOI=Y NAS=N STR=N SON=N RHO=N CNSGRV=Y CNSCMP=Y>g</CNS>", "" },
    { "h", 0, "<CNS CNT=Y VOI=N NAS=N STR=Y SON=N RHO=N CNSGRV=Y CNSCMP=Y>h</CNS>", "" },
    { "i", NUC|LONG, "<VOC GRV=N HEIGHT=CLOSE RND=N>i</VOC>", "<VOC GRV=N HEIGHT=CLOSE RND=N>i</VOC>" },
    { "j", 0, "<CNS CNT=Y VOI=Y NAS=N STR=N SON=Y RHO=N CNSGRV=N CNSCMP=Y VOCGRV=N VOCHEIGHT=CLOSE VOCRND=N>j</CNS>", "" },
    { "k", 0, "<CNS CNT=N VOI=N NAS=N STR=N SON=N RHO=N CNSGRV=Y CNSCMP=Y>k</CNS>", "" },
    { "l", 0, "<CNS CNT=Y VOI=Y NAS=N STR=N SON=Y RHO=N CNSGRV=N CNSCMP=N VOCHEIGHT=CLOSE VOCRND=N>l</CNS>", "" },
    { "m", 0, "<CNS CNT=N VOI=Y NAS=Y STR=N SON=Y RHO=N CNSGRV=Y CNSCMP=N>m</CNS>", "" },
    { "n", 0, "<CNS CNT=N VOI=Y NAS=Y STR=N SON=Y RHO=N CNSGRV=N CNSCMP=N>n</CNS>", "" },
    { "p", 0, "<CNS CNT=N VOI=N NAS=N STR=N SON=N RHO=N CNSGRV=Y CNSCMP=N>p</CNS>", "" },
    { "r", 0, "<CNS CNT=Y VOI=Y NAS=N STR=N SON=Y RHO=Y CNSGRV=N CNSCMP=N VOCGRV=Y VOCHEIGHT=MID>r</CNS>", "" },
    { "s", 0, "<CNS CNT=Y VOI=N NAS=N STR=Y SON=N RHO=N CNSGRV=N CNSCMP=N>s</CNS>", "" },
    { "t", 0, "<CNS CNT=N VOI=N NAS=N STR=N SON=N RHO=N CNSGRV=N CNSCMP=N>t</CNS>", "" },
    { "tS", 0, "<CNS CNT=N VOI=N NAS=N STR=Y SON=N RHO=N>tS</CNS>", "" },
    { "u", NUC|LONG, "<VOC GRV=Y HEIGHT=CLOSE RND=Y>u</VOC>", "<VOC GRV=Y HEIGHT=CLOSE RND=Y>u</VOC>" },
    { "v", 0, "<CNS CNT=Y VOI=Y NAS=N STR=Y SON=N RHO=N CNSGRV=Y CNSCMP=N>v</CNS>", "" },
    { "w", 0, "<CNS CNT=Y VOI=Y NAS=N STR=N SON=Y RHO=N CNSGRV=Y VOCGRV=Y VOCHEIGHT=CLOSE VOCRND=Y>w</CNS>", "" },
    { "z", 0, "<CNS CNT=Y VOI=Y NAS=N STR=Y SON=N RHO=N CNSGRV=N CNSCMP=N>z</CNS>", "" },
    { "{", NUC, "<VOC GRV=N HEIGHT=OPEN RND=N>{</VOC>", "<VOC GRV=N HEIGHT=OPEN RND=N>{</VOC>" },
};

------------------------------------------------------------------------
/* sylonoff.h -- syllable onsets and offsets */

/* SAMPA version - February 1998 */

/*
 * Consonant sequences accepted as a syllable onset, written in SAMPA.
 * Single consonants come first, followed by the clusters.
 * (Decoded from quoted-printable: the initializer was written "=3D{".)
 */
const char *SAMPAonset[]={
"p","t","k","b","d","g","tS","dZ","m","n",
"h","f","v","T","D","s","z","S","l","r","w","j",
"sp","st","sk","sm","sn","sf","sl","sr","sw","sj",
"Sr","Sl",
"spl","skl","spr","str","skr","spj","stj","skj",
"pl","kl","bl","gl","pr","tr","kr","br","dr","gr",
"tw","kw","dw","gw","pj","tj","kj","bj","dj","gj",
"mj","nj",
};
/* Number of entries in SAMPAonset[]; tied to the array's element type. */
#define NUMSAMPAONSET (sizeof(SAMPAonset)/sizeof(SAMPAonset[0]))

/*
 * Consonant sequences accepted as a syllable offset (coda), written in
 * SAMPA.  (Decoded from quoted-printable: the initializer was written
 * "=3D{".)
 *
 * NOTE(review): "tS", "dZ", "ntS", "ndZ", "mps" and "nts" each appear
 * twice.  The duplicates are kept so that NUMSAMPAOFFSET and the entry
 * positions stay as they were; they look unintended -- confirm and prune.
 */
const char *SAMPAoffset[]={
"p","t","k","b","d","g","tS","dZ","m","n","N",
"ps","ts","ks","bz","dz","gz","tS","dZ","mz","nz","Nz",
"f","v","T","D","s","z","S","Z","l",
"fs","vz","Ts","Dz","lz",
"mp","nt","Nk","mb","nd","Ng","ntS","ndZ",
"mps","nts","Nks","mbz","ndz","Ngz",
"lp","lt","lk","lb","ld","lg","ltS","ldZ",
"lps","lts","lks","lbz","ldz","lgz",
"mf","mT","mpT","mps",
"mfs","mTs","mpTs",
"nT","ntT","ntS","ns","nts","nS","nZ","ndZ",
"nTs","ntTs",
"NT","NkT",
"NTs","NkTs",
};
/* Number of entries in SAMPAoffset[]; tied to the array's element type. */
#define NUMSAMPAOFFSET (sizeof(SAMPAoffset)/sizeof(SAMPAoffset[0]))
------------------------------------------------------------------------
<?XML version="1.0"?>
<!DOCTYPE LEXICON [
<!ELEMENT LEXICON (ENTRY)* >
<!ELEMENT ENTRY (HW, POSSEQ?, PRONSEQ?) >
<!ELEMENT HW (#PCDATA) >
<!ELEMENT POSSEQ (POS)* >
<!ELEMENT POS (#PCDATA) >
<!ELEMENT PRONSEQ (PRON)* >
<!ELEMENT PRON (IPA?, SYLSEQ?) >
<!ELEMENT IPA (#PCDATA) >
<!ELEMENT SYLSEQ (SYL)* >
<!ELEMENT SYL (ONSET? ,RHYME) >
<!ELEMENT ONSET (CNS)* >
<!ELEMENT RHYME (NUC, CODA?) >
<!ELEMENT NUC (VOC)* >
<!ELEMENT CODA (CNS)* >
<!ELEMENT VOC (#PCDATA)>
<!ELEMENT CNS (#PCDATA)>
<!ATTLIST ENTRY
id ID #REQUIRED
href CDATA #REQUIRED>
<!ATTLIST SYLSEQ
id ID #REQUIRED
href CDATA #REQUIRED>
<!ATTLIST SYL
strength (strong|weak) 'weak'
weight (heavy|light) 'light' >
<!ATTLIST ONSET
strength (strong|weak) 'weak' >
<!ATTLIST RHYME
strength (strong|weak) 'weak'
weight (heavy|light) 'light'
checked (Y|N) 'N'
voice (Y|N) 'N'>
<!ATTLIST NUC
strength (strong|weak) 'weak'
weight (heavy|light) 'light'
checked (Y|N) 'N'
voice (Y|N) 'N'
long (Y|N) 'N' >
<!ATTLIST CODA
voice (Y|N) 'N' >
<!ATTLIST CNS
CNT (Y|N) 'N'
VOI (Y|N) 'N'
NAS (Y|N) 'N'
STR (Y|N) 'N'
SON (Y|N) 'N'
RHO (Y|N) 'N'
CNSGRV (Y|N) 'N'
CNSCMP (Y|N) 'N'
VOCHEIGHT (Y|N) 'N'
VOCRND (Y|N) 'N'
AMBI (Y|N) 'N' >
<!ATTLIST VOC
GRV (Y|N) 'N'
HEIGHT (Y|N) 'N'
RND (Y|N) 'N' >
<!ATTLIST SEG
AMBI (Y|N) 'N' >
]>
------------------------------------------------------------------------

<ENTRY ID='LISA'>
<HW>Lisa</HW>
<POSSEQ>
<POS PRN='#1'>N(prop,sing)</POS>
</POSSEQ>
<PRONSEQ>
<PRON ID='1'>
<IPA ID='1'>'lis@</IPA>
<SYLSEQ>
<SYL>
<ONSET>
<CNS CNT='Y' VOI='Y' NAS='N' STR='N' SON='Y' RHO='N' CNSGRV='N' CNSCMP='N' VOCHEIGHT='CLOSE' VOCRND='N'>l</CNS>
</ONSET>
<RHYME>
<NUC>
<VOC GRV='N' HEIGHT='CLOSE' RND='N'>i</VOC>
<VOC GRV='N' HEIGHT='CLOSE' RND='N'>i</VOC>
</NUC>
<CODA>
<CNS AMBI='Y' CNT='Y' VOI='N' NAS='N' STR='Y' SON='N' RHO='N' CNSGRV='N' CNSCMP='N'>s</CNS>
</CODA>
</RHYME>
</SYL>
<SYL>
<ONSET>
<CNS AMBI='Y' CNT='Y' VOI='N' NAS='N' STR='Y' SON='N' RHO='N' CNSGRV='N' CNSCMP='N'>s</CNS>
</ONSET>
<RHYME>
<NUC>
<VOC GRV='Y' HEIGHT='MID' RND='N'>@</VOC>
<VOC GRV='Y' HEIGHT='MID' RND='N'>@</VOC>
</NUC>
</RHYME>
</SYL>
</SYLSEQ>
</PRON>
</PRONSEQ>
</ENTRY>

<ENTRY ID='LEAD'>
<HW>lead</HW>
<POSSEQ>
<POS PRN='#1'>N(com,sing)</POS>
<POS PRN='#2'>V</POS>
<POS PRN='#2'>N(com,sing)</POS>
</POSSEQ>
<PRONSEQ>
<PRON ID='1'>
<IPA ID='1'>'led</IPA>
<SYLSEQ>
<SYL>
<ONSET>
<CNS CNT='Y' VOI='Y' NAS='N' STR='N' SON='Y' RHO='N' CNSGRV='N' CNSCMP='N' VOCHEIGHT='CLOSE' VOCRND='N'>l</CNS>
</ONSET>
<RHYME>
<NUC>
<VOC GRV='N' HEIGHT='MID' RND='N'>e</VOC>
<VOC GRV='N' HEIGHT='MID' RND='N'>e</VOC>
</NUC>
<CODA>
<CNS AMBI='Y' CNT='N' VOI='Y' NAS='N' STR='N' SON='N' RHO='N' CNSGRV='N' CNSCMP='N'>d</CNS>
</CODA>
</RHYME>
</SYL>
</SYLSEQ>
</PRON>
<PRON ID='2'>
<IPA ID='2'>'lid</IPA>
<SYLSEQ>
<SYL>
<ONSET>
<CNS CNT='Y' VOI='Y' NAS='N' STR='N' SON='Y' RHO='N' CNSGRV='N' CNSCMP='N' VOCHEIGHT='CLOSE' VOCRND='N'>l</CNS>
</ONSET>
<RHYME>
<NUC>
<VOC GRV='N' HEIGHT='CLOSE' RND='N'>i</VOC>
<VOC GRV='N' HEIGHT='CLOSE' RND='N'>i</VOC>
</NUC>
<CODA>
<CNS AMBI='Y' CNT='N' VOI='Y' NAS='N' STR='N' SON='N' RHO='N' CNSGRV='N' CNSCMP='N'>d</CNS>
</CODA>
</RHYME>
</SYL>
</SYLSEQ>
</PRON>
</PRONSEQ>
</ENTRY>

--=====================_888950944==_--