diff options
author | Nick White <git@njw.me.uk> | 2013-05-26 21:52:41 +0100 |
---|---|---|
committer | Nick White <git@njw.me.uk> | 2013-05-26 21:52:41 +0100 |
commit | 5d0c5974e2655a6f66153bffb67c9346c2c4a589 (patch) | |
tree | 5f0f2d773dda4bb5f7078c0971cc92cf5c46000d /htmtojargon.awk | |
parent | b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3 (diff) | |
download | oed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.tar.bz2 oed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.zip |
Correct e tag removal, correct headword separation
Diffstat (limited to 'htmtojargon.awk')
-rw-r--r-- | htmtojargon.awk | 5 |
1 file changed, 5 insertions, 0 deletions
diff --git a/htmtojargon.awk b/htmtojargon.awk index 16c5356..323b7ee 100644 --- a/htmtojargon.awk +++ b/htmtojargon.awk @@ -1,5 +1,6 @@ #!/usr/bin/awk -f # dirty xml reading is more fun +# requires nawk / gawk for sub() BEGIN { FS = "</hg>" @@ -8,6 +9,10 @@ BEGIN { { printdefs($1, $2); } function printdefs(word, defs) { + # split the headwords and other information about the word, + # to be used in conjunction with dictfmt's --index-data-separator + sub("<i>", " ", word); + n = split(defs, array, "<def>"); for(i=0; i<=n; i++) { if(array[i] != "") { |