Correct e tag removal, correct headword separation

author: Nick White <git@njw.me.uk> 2013-05-26 21:52:41 +0100
committer: Nick White <git@njw.me.uk> 2013-05-26 21:52:41 +0100
commit: 5d0c5974e2655a6f66153bffb67c9346c2c4a589 (patch)
tree: 5f0f2d773dda4bb5f7078c0971cc92cf5c46000d /htmtojargon.awk
parent: b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3 (diff)
download: oed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.tar.bz2
oed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.zip
1 files changed, 5 insertions, 0 deletions
diff --git a/htmtojargon.awk b/htmtojargon.awk
index 16c5356..323b7ee 100644
--- a/htmtojargon.awk
+++ b/htmtojargon.awk
@@ -1,5 +1,6 @@
 #!/usr/bin/awk -f
 # dirty xml reading is more fun
+# requires nawk / gawk for sub()
 
 BEGIN {
 	FS = "</hg>"
@@ -8,6 +9,10 @@ BEGIN {
 { printdefs($1, $2); }
 
 function printdefs(word, defs) {
+	# split the headwords and other information about the word,
+	# to be used in conjunction with dictfmt's --index-data-separator
+	sub("<i>", "	", word);
+
 	n = split(defs, array, "<def>");
 	for(i=0; i<=n; i++) {
 		if(array[i] != "") {
author	Nick White <git@njw.me.uk>	2013-05-26 21:52:41 +0100
committer	Nick White <git@njw.me.uk>	2013-05-26 21:52:41 +0100
commit	5d0c5974e2655a6f66153bffb67c9346c2c4a589 (patch)
tree	5f0f2d773dda4bb5f7078c0971cc92cf5c46000d /htmtojargon.awk
parent	b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3 (diff)
download	oed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.tar.bz2 oed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.zip