summaryrefslogtreecommitdiff
path: root/htmtojargon.awk
diff options
context:
space:
mode:
authorNick White <git@njw.me.uk>2013-05-26 21:52:41 +0100
committerNick White <git@njw.me.uk>2013-05-26 21:52:41 +0100
commit5d0c5974e2655a6f66153bffb67c9346c2c4a589 (patch)
tree5f0f2d773dda4bb5f7078c0971cc92cf5c46000d /htmtojargon.awk
parentb0055fe870a04fbd8eaef669c1ccfb0febfe8bc3 (diff)
downloadoed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.tar.bz2
oed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.zip
Correct e tag removal, correct headword separation
Diffstat (limited to 'htmtojargon.awk')
-rw-r--r--htmtojargon.awk5
1 files changed, 5 insertions, 0 deletions
diff --git a/htmtojargon.awk b/htmtojargon.awk
index 16c5356..323b7ee 100644
--- a/htmtojargon.awk
+++ b/htmtojargon.awk
@@ -1,5 +1,6 @@
#!/usr/bin/awk -f
# dirty xml reading is more fun
+# requires nawk / gawk for sub()
BEGIN {
FS = "</hg>"
@@ -8,6 +9,10 @@ BEGIN {
{ printdefs($1, $2); }
function printdefs(word, defs) {
+ # split the headwords and other information about the word,
+ # to be used in conjunction with dictfmt's --index-data-separator
+ sub("<i>", " ", word);
+
n = split(defs, array, "<def>");
for(i=0; i<=n; i++) {
if(array[i] != "") {