From 5d0c5974e2655a6f66153bffb67c9346c2c4a589 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.me.uk>
Date: Sun, 26 May 2013 21:52:41 +0100
Subject: Correct e tag removal, correct headword separation

---
 BUGS                 | 6 ------
 Makefile             | 5 +++--
 htmtojargon.awk      | 5 +++++
 removeetags.pl       | 4 ++++
 separateheadmeta.sed | 2 --
 xmlcleanup2.sed      | 6 ------
 6 files changed, 12 insertions(+), 16 deletions(-)
 create mode 100644 removeetags.pl
 delete mode 100644 separateheadmeta.sed
diff --git a/BUGS b/BUGS
index 1426aab..7358117 100644
--- a/BUGS
+++ b/BUGS
@@ -1,9 +1,3 @@
-There is a bug causing some words to not be included at all. Cowboy and chicken are examples.
-
-Word type (noun, adj, verb, etc.) markings after headword are included in the index, so searching for e.g. "A" or "n" is impossible as it returns all adjectives or nouns.
-
 Many of the IPA and other non-ascii symbols aren't yet converted to unicode, and are just discarded. Many (but not all) of these symbols are documented in files in Help/ from the original CDs.
 
 Search results are not necessarily returned in order. This is probably unavoidable with the dict protocol.
-
-grep TODO symbols *sed *awk Makefile
diff --git a/Makefile b/Makefile
index 10c4fb3..cfb4484 100644
--- a/Makefile
+++ b/Makefile
@@ -11,9 +11,10 @@ all: oed.jargon oed.dict.dz
 
 .htm.jargon:
 	iconv -f ISO-8859-1 -t UTF-8 < "$<" \
+	| sed 's/\r//g' \
 	| awk -f htmtojargon.awk \
-	| sed -f separateheadmeta.sed \
 	| sed -f symbols.sed \
+	| perl -p removeetags.pl \
 	| sed -f xmlcleanup.sed \
 	| sed -f xmlcleanup2.sed \
 	> "$@"
@@ -28,7 +29,7 @@ oed.dict: oed.jargon
 	cat $< \
 	| dictfmt -j --utf8 \
 	  --columns 0 --headword-separator ',' \
-	  --index-data-separator ';' \
+	  --index-data-separator "	" \
 	  -u http://njw.me.uk/oed \
 	  -s "Oxford English Dictionary, 2nd Edition" \
 	  oed
diff --git a/htmtojargon.awk b/htmtojargon.awk
index 16c5356..323b7ee 100644
--- a/htmtojargon.awk
+++ b/htmtojargon.awk
@@ -1,5 +1,6 @@
 #!/usr/bin/awk -f
 # dirty xml reading is more fun
+# requires nawk / gawk for sub()
 
 BEGIN {
 	FS = "</hg>"
@@ -8,6 +9,10 @@ BEGIN {
 { printdefs($1, $2); }
 
 function printdefs(word, defs) {
+	# split the headwords and other information about the word,
+	# to be used in conjunction with dictfmt's --index-data-separator
+	sub("<i>", "	", word);
+
 	n = split(defs, array, "<def>");
 	for(i=0; i<=n; i++) {
 		if(array[i] != "") {
diff --git a/removeetags.pl b/removeetags.pl
new file mode 100644
index 0000000..8de7875
--- /dev/null
+++ b/removeetags.pl
@@ -0,0 +1,4 @@
+# <e> tags seem to duplicate other preceding tags, so remove them
+#
+# sed doesn't support non-greedy matching, so we're using perl
+s/<e>.*?<\/e>//g
diff --git a/separateheadmeta.sed b/separateheadmeta.sed
deleted file mode 100644
index 9dca6c6..0000000
--- a/separateheadmeta.sed
+++ /dev/null
@@ -1,2 +0,0 @@
-# this replaces the first <i> in a headword with "; "
-/^:[^:]*:/ s/<i>/; /
diff --git a/xmlcleanup2.sed b/xmlcleanup2.sed
index 5352a92..56e66b0 100644
--- a/xmlcleanup2.sed
+++ b/xmlcleanup2.sed
@@ -1,12 +1,6 @@
 #!/bin/sed -f
 # sed rules that must run after the main set
 
-# <e> tags seem to duplicate <v> tags for some quotes
-# NOTE this assumes e tags only come at end of lines
-s/<e>.*$//g
-# TODO: make this work instead of the above
-#s/<e>.*?<\/e>//g
-
 # any xml tags not processed can just go away
 s/<[^>]*>//g
 # any xml character entities not processed can just go away
-- 
cgit v1.2.3