// ===========================================================================
//
//                            PUBLIC DOMAIN NOTICE
//            National Center for Biotechnology Information (NCBI)
//
//  This software/database is a "United States Government Work" under the
//  terms of the United States Copyright Act. It was written as part of
//  the author's official duties as a United States Government employee and
//  thus cannot be copyrighted. This software/database is freely available
//  to the public for use. The National Library of Medicine and the U.S.
//  Government do not place any restriction on its use or reproduction.
//  We would, however, appreciate having the NCBI and the author cited in
//  any work or product based on this material.
//
//  Although all reasonable efforts have been taken to ensure the accuracy
//  and reliability of the software and data, the NLM and the U.S.
//  Government do not and cannot warrant the performance or results that
//  may be obtained by using this software or data. The NLM and the U.S.
//  Government disclaim all warranties, express or implied, including
//  warranties of performance, merchantability or fitness for any particular
//  purpose.
//
// ===========================================================================
//
// File Name:  xtract.go
//
// Author:  Jonathan Kans
//
// ==========================================================================

package main

import (
	"bufio"
	"bytes"
	"fmt"
	"html"
	"io"
	"os"
	"strconv"
	"strings"
	"unicode"
)

// VERSION AND HELP MESSAGE TEXT

// xtract_version is the version string for this xtract source file.
const xtract_version = "3.30"

// xtract_help is the usage summary of the documented command-line arguments,
// grouped by purpose (exploration, conditions, formatting, selection, XML tools).
// NOTE(review): where and when it is printed is decided outside this section.
const xtract_help = `
Exploration Argument Hierarchy

  -pattern        Name of record within set
  -group            Use of different argument
  -block              names allows command-line
  -subset               control of nested looping

Conditional Execution

  -position       Must be at given location in list
  -match          Element [@attribute] [:value] required
  -avoid          Skip if element matches
  -and            All tests must pass
  -or             Any passing test suffices

Format Customization

  -ret            Override line break between patterns
  -tab            Replace tab character between fields
  -sep            Separator between group members
  -pfx            Prefix to print before group
  -sfx            Suffix to print after group
  -lbl            Insert arbitrary text

Item Selection

  -element        Print all items that match tag name
  -first          Only print value of first item
  -last           Only print value of last item
  -NAME           Record value in named variable

-element Constructs

  Tag             Caption
  Group           Initials,LastName
  Parent/Child    MedlineCitation/PMID
  Attribute       DescriptorName@MajorTopicYN
  Object Count    "#Author"
  Item Length     "%Title"
  Variable        "&NAME"

Exploration Constructs

  Object          DateCreated
  Parent/Child    Book/AuthorList
  Heterogeneous   PubmedArticleSet/*
  Recursive       */Taxon

Command Generator

  -insd           Generate INSDSeq extraction commands

XML Formatting

  -format         Repair XML format and indentation
  -outline        Display outline of XML structure
  -synopsis       Display count of unique XML paths

Documentation

  -examples       Print examples of EDirect and xtract usage
`

// xtract_undocumented lists the arguments that are left out of xtract_help:
// XML clean-up switches, string and numeric -match constraints, extra format
// controls, numeric selection, and a few usage examples.
const xtract_undocumented = `
XML Processing

  -cleanup        Fix non-ASCII spaces
  -compress       Compress runs of spaces
  -repair         Cleanup and compress tags, not contents

String -match Constraints

  -contains       Substring must be present

Numeric -match Constraints

  -gt             Greater than
  -ge             Greater than or equal to
  -lt             Less than
  -le             Less than or equal to
  -eq             Equal to
  -ne             Not equal to

Format Customization

  -clr            Clear queued tab separator
  -rst            Reset -sep, -pfx, and -sfx

Numeric Selection

  -sum            Sum
  -min            Minimum
  -max            Maximum

Miscellaneous Arguments

  -verify         Verify integrity of XML

Examples

  xtract -pattern DocumentSummary -match "%Title" -le 70 -and "#Author" -lt 6

  xtract -pattern PubmedArticle -match "DateCreated/Year" -ne "DateRevised/Year"

  xtract -pattern Taxon -match GenbankCommonName -contains mouse

  xtract -pattern INSDSeq -block INSDReference -position 2

  xtract -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop -min ChrStart,ChrStop

  xtract -insd CDS,mRNA INSDFeature_key "#INSDInterval" product transcription translation
`

// xtract_examples is a set of worked EDirect pipelines, each followed by
// sample output, showing how xtract is combined with esearch, efetch and
// ordinary shell tools.
const xtract_examples = `
Publications

  efetch -db pubmed -id 6271474,5685784,4882854,6243420 -format xml | \
  xtract -pattern PubmedArticle -element MedlineCitation/PMID "#Author" \
    -block Author -position first -sep " " -element Initials,LastName \
    -block Article -element ArticleTitle

  6271474    5    MJ Casadaban     Tn3: transposition and control.
  5685784    2    RK Mortimer      Suppressors and suppressible mutations in yeast.
  4882854    2    ED Garber        Proteins and enzymes as taxonomic tools.
  6243420    1    NR Cozzarelli    DNA gyrase and the supercoiling of DNA.

Peptide Sequences

  esearch -db protein -query "conotoxin AND mat_peptide [FKEY]" | \
  efetch -format gpc | \
  xtract -insd complete mat_peptide "%peptide" product peptide | \
  grep -i conotoxin | sort -t $'\t' -u -k 2,2n | head -n 8

  ADB43131.1    15    conotoxin Cal 1b     LCCKRHHGCHPCGRT
  AIC77099.1    16    conotoxin Im1.2      GCCSHPACNVNNPHIC
  AIC77105.1    17    conotoxin Lt1.4      GCCSHPACDVNNPDICG
  AIC77103.1    18    conotoxin Lt1.2      PRCCSNPACNANHAEICG
  AIC77083.1    20    conotoxin Bt14.6     KDCTYCMHSSCSMMYEKCRP
  AIC77085.1    21    conotoxin Bt14.8     NECDNCMRSFCSMIYEKCRLK
  AIC77093.1    22    conotoxin Bt14.16    GDCKPCMHPDCRFNPGRCRPRE
  AIC77154.1    23    conotoxin Bt14.19    VREKDCPPHPVPGMHKCVCLKTC

Chromosome Locations

  esearch -db gene -query "calmodulin [PFN] AND mammalia [ORGN]" | \
  efetch -format docsum | \
  xtract -pattern DocumentSummary -MAP "(-)" -MAP MapLocation \
    -element Id Name "&MAP" ScientificName

  801       CALM1    14q32.11         Homo sapiens
  808       CALM3    19q13.2-q13.3    Homo sapiens
  805       CALM2    2p21             Homo sapiens
  24242     Calm1    6q31-q32         Rattus norvegicus
  12313     Calm1    12 E             Mus musculus
  326597    CALM     -                Bos taurus
  50663     Calm2    6q11-q12         Rattus norvegicus
  24244     Calm3    1q22             Rattus norvegicus
  12315     Calm3    7 9.15 cM        Mus musculus
  12314     Calm2    17 E4            Mus musculus
  617095    CALM1    -                Bos taurus
  396838    CALM3    6                Sus scrofa
  ...

Gene Regions

  esearch -db gene -query "recombination activating gene [TITL]" | \
  efilter -query "human [ORGN]" | efetch -format docsum | \
  xtract -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop | \
  xargs -n 3 sh -c 'efetch -db nuccore -format gb \
    -id "$0" -chr_start "$1" -chr_stop "$2"'

  LOCUS       NC_000011              69363 bp    DNA     linear   CON 12-MAR-2015
  DEFINITION  Homo sapiens chromosome 11, GRCh38.p2 Primary Assembly.
  ACCESSION   NC_000011 REGION: 36510400..36579762 GPC_000001303
  VERSION     NC_000011.10  GI:568815587
  ...
  FEATURES             Location/Qualifiers
       source          1..69363
                       /organism="Homo sapiens"
                       /mol_type="genomic DNA"
                       /db_xref="taxon:9606"
                       /chromosome="11"
       gene            1..69363
                       /gene="RAG1"
       mRNA            join(1..138,52986..53099,62892..67333)
                       /gene="RAG1"
                       /product="recombination activating gene 1, transcript
                       variant X1"
                       /transcript_id="XM_011520250.1"
       CDS             62906..66037
                       /gene="RAG1"
                       /codon_start=1
                       /product="V(D)J recombination-activating protein 1 isoform
                       X1"
                       /protein_id="XP_005253098.1"
                       /translation="MAASFPPTLGLSSAPDEIQHPHIKFSEWKFKLFRVRSFEKTPEE
                       AQKEKKDSFEGKPSLEQSPAVLDKADGQKPVPTQPLLKAHPKFSKKFHDNEKARGKAI
                       HQANLRHLCRICGNSFRADEHNRRYPVHGPVDGKTLGLLRKKEKRATSWPDLIAKVFR
                       IDVKADVDSIHPTEFCHNCWSIMHRKFSSAPCEVYFPRNVTMEWHPHTPSCDICNTAR
                       ...

Taxonomic Names

  esearch -db taxonomy -query "txid10090 [SBTR] OR camel [COMN]" | \
  efetch -format docsum | \
  xtract -pattern DocumentSummary -match CommonName \
    -element Id ScientificName CommonName

  57486    Mus musculus molossinus    Japanese wild mouse
  39442    Mus musculus musculus      eastern European house mouse
  35531    Mus musculus bactrianus    southwestern Asian house mouse
  10092    Mus musculus domesticus    western European house mouse
  10091    Mus musculus castaneus     southeastern Asian house mouse
  10090    Mus musculus               house mouse
  9838     Camelus dromedarius        Arabian camel
  9837     Camelus bactrianus         Bactrian camel

Structural Similarity

  esearch -db structure -query "crotalus [ORGN] AND phospholipase A2" | \
  elink -related | efilter -query "archaea [ORGN]" | efetch -format docsum | \
  xtract -pattern DocumentSummary -match "PdbClass:Hydrolase" \
    -element PdbAcc PdbDescr

  3VV2    Crystal Structure Of Complex Form Between S324a-subtilisin And Mutant Tkpro
  3VHQ    Crystal Structure Of The Ca6 Site Mutant Of Pro-Sa-Subtilisin
  2ZWP    Crystal Structure Of Ca3 Site Mutant Of Pro-S324a
  2ZWO    Crystal Structure Of Ca2 Site Mutant Of Pro-S324a
  ...

Indexed Fields

  einfo -db pubmed | \
  xtract -pattern Field -match "IsDate:Y" -and "IsHidden:N" \
    -pfx "[" -sep "]\t" -element Name,FullName | \
  sort -t $'\t' -k 2f

  [CDAT]    Date - Completion
  [CRDT]    Date - Create
  [EDAT]    Date - Entrez
  [MHDA]    Date - MeSH
  [MDAT]    Date - Modification
  [PDAT]    Date - Publication

Record Counts

  echo "diphtheria measles pertussis polio tuberculosis" | \
  xargs -n 1 sh -c 'esearch -db pubmed -query "$0 [MESH]" |
  efilter -days 365 -datetype PDAT |
  xtract -pattern ENTREZ_DIRECT -lbl "$0" -element Count'

  diphtheria      18
  measles         166
  pertussis       98
  polio           75
  tuberculosis    1386
`

// DATA OBJECTS

// Attrib is one name/value attribute pair belonging to an XML element.
type Attrib struct {
	// Name is compared against the requested attribute name during matching.
	Name  string
	// Value is the text handed to callbacks and printed between double quotes.
	Value string
}

// Node is one element of the in-memory XML tree that is explored when
// collecting values.
type Node struct {
	// Name is the element tag name.
	Name       string
	// Parent is compared with the parent part of a Parent/Child match;
	// presumably the enclosing element's tag name (set outside this section).
	Parent     string
	// Contents is the element's text; a non-empty value stops exploration
	// from descending into Children.
	Contents   string
	// Attributes holds the element's attribute pairs.
	Attributes []Attrib
	// Children holds nested elements in the order they are visited.
	Children   []*Node
}

// Block is one level of the parsed command line: an optional object to visit,
// the tests guarding it, and either raw commands or nested sub-blocks.
type Block struct {
	// Visit is the argument that followed the exploration flag (e.g. -pattern).
	Visit      string
	// Position is the argument that followed -position, if one was given.
	Position   string
	// Conditions holds the leading -match/-avoid/-and/-or style arguments.
	Conditions []string
	// Commands holds the remaining arguments; cleared once split into Subtasks.
	Commands   []string
	// Subtasks holds the nested blocks produced by ParseCommands.
	Subtasks   []*Block
}

// UTILITIES

// IsNotJustWhitespace reports whether str holds at least one character other
// than a space, tab, carriage return, or newline. An empty string yields false.
func IsNotJustWhitespace(str string) bool {

	for _, ch := range str {
		switch ch {
		case ' ', '\t', '\r', '\n':
			// plain whitespace, keep scanning
		default:
			return true
		}
	}

	return false
}

// IsAllCapsOrDigits reports whether every character of str is an upper-case
// letter or a digit. An empty string vacuously satisfies the test.
func IsAllCapsOrDigits(str string) bool {

	for _, ch := range str {
		if unicode.IsUpper(ch) || unicode.IsDigit(ch) {
			continue
		}
		// first offending character settles the answer
		return false
	}

	return true
}

// CompressRunsOfSpaces returns str with every run of Unicode whitespace
// collapsed into one ASCII space. Leading and trailing runs are collapsed
// too, not removed.
func CompressRunsOfSpaces(str string) string {

	var out bytes.Buffer
	prevWasSpace := false

	for _, ch := range str {
		isSpace := unicode.IsSpace(ch)
		switch {
		case !isSpace:
			out.WriteRune(ch)
		case !prevWasSpace:
			// first whitespace character of a run stands in for the whole run
			out.WriteRune(' ')
		}
		prevWasSpace = isSpace
	}

	return out.String()
}

// HasBadSpace reports whether str contains any Unicode whitespace character
// other than the plain ASCII space.
func HasBadSpace(str string) bool {

	for _, ch := range str {
		if ch == ' ' {
			continue
		}
		if unicode.IsSpace(ch) {
			return true
		}
	}

	return false
}

// CleanupBadSpaces returns str with every Unicode whitespace character
// replaced, one for one, by an ASCII space. Runs are not collapsed.
func CleanupBadSpaces(str string) string {

	var out bytes.Buffer

	for _, ch := range str {
		replacement := ch
		if unicode.IsSpace(ch) {
			replacement = ' '
		}
		out.WriteRune(replacement)
	}

	return out.String()
}

// Side selectors for SplitInTwoAt: they choose which result receives the
// whole string when the separator is absent.
const (
	LEFT = iota + 1
	RIGHT
)

// SplitInTwoAt cuts str at the first occurrence of chr and returns the text
// before and after it. When chr does not occur, the whole string is returned
// as the first result if side is LEFT, otherwise as the second, and the other
// result is empty.
func SplitInTwoAt(str, chr string, side int) (string, string) {

	parts := strings.SplitN(str, chr, 2)

	switch {
	case len(parts) > 1:
		return parts[0], parts[1]
	case side == LEFT:
		return str, ""
	default:
		return "", str
	}
}

// Argument kinds returned by ParseFlag. UNSET (zero) means the text is not a
// flag at all, VARIABLE is a dash followed only by capitals or digits, and
// UNRECOGNIZED is any other dash-prefixed text. Callers also use these values
// as the "which operand is expected next" state while scanning arguments.
const (
	UNSET = iota
	ELEMENT
	FIRST
	LAST
	PFX
	SFX
	SEP
	TAB
	RET
	LBL
	CLR
	RST
	MATCH
	AVOID
	AND
	OR
	CONTAINS
	GT
	GE
	LT
	LE
	EQ
	NE
	SUM
	MIN
	MAX
	XML
	DOCTYPE
	VARIABLE
	UNRECOGNIZED
)

func ParseFlag(str string) int {

	switch str {
	case "-element":
		return ELEMENT
	case "-first":
		return FIRST
	case "-last":
		return LAST
	case "-pfx":
		return PFX
	case "-sfx":
		return SFX
	case "-sep":
		return SEP
	case "-tab":
		return TAB
	case "-ret":
		return RET
	case "-lbl":
		return LBL
	case "-clr":
		return CLR
	case "-rst":
		return RST
	case "-match":
		return MATCH
	case "-avoid":
		return AVOID
	case "-and":
		return AND
	case "-or":
		return OR
	case "-contains":
		return CONTAINS
	case "-gt":
		return GT
	case "-ge":
		return GE
	case "-lt":
		return LT
	case "-le":
		return LE
	case "-eq":
		return EQ
	case "-ne":
		return NE
	case "-sum":
		return SUM
	case "-min":
		return MIN
	case "-max":
		return MAX
	case "-xml":
		return XML
	case "-doctype":
		return DOCTYPE
	default:
		if len(str) > 1 && str[0] == '-' && IsAllCapsOrDigits(str[1:]) {
			return VARIABLE
		}
	}

	if len(str) > 0 && str[0] == '-' {
		return UNRECOGNIZED
	}

	return UNSET
}

// ConvertSlash expands backslash escapes in a command-line value: \n, \r and
// \t become newline, carriage return and tab, and a backslash followed by any
// other character yields that character (so \\ gives one backslash). A lone
// backslash at the very end of str is dropped.
//
// The string is scanned byte by byte. Every character that matters here is
// ASCII, and no byte of a multi-byte UTF-8 sequence can equal an ASCII byte,
// so non-ASCII text is copied through unchanged instead of being truncated
// to one byte per rune.
func ConvertSlash(str string) string {

	if str == "" {
		return str
	}

	// result can never be longer than the input
	res := make([]byte, 0, len(str))

	escaped := false
	for i := 0; i < len(str); i++ {
		ch := str[i]

		if escaped {
			switch ch {
			case 'n':
				res = append(res, '\n')
			case 'r':
				res = append(res, '\r')
			case 't':
				res = append(res, '\t')
			default:
				res = append(res, ch)
			}
			escaped = false
			continue
		}

		if ch == '\\' {
			escaped = true
			continue
		}

		res = append(res, ch)
	}

	return string(res)
}

// CONVERT XML INPUT DATA TO NORMALIZED CHANNEL OF TOKENS

// Token categories produced by NextToken. SENTINEL marks the end of the
// input (and is also returned, with ok false, when no reader is usable).
const (
	_ = iota
	XML_TAG
	DOC_TAG
	START_TAG
	ATTRIBUTES
	CONTENTS
	END_TAG
	SENTINEL
)

// BUFFERSIZE is the number of buffered tokens at which a refill of an
// XmlReader stops reading further input.
const BUFFERSIZE = 128

// XmlReader holds the input stream plus a buffer of already-parsed tokens,
// stored as two parallel slices, that NextToken hands out one at a time.
type XmlReader struct {
	// Reader is the buffered input the XML text is read from.
	Reader     *bufio.Reader
	// Categories and Values hold the category and text of each buffered token.
	Categories []int
	Values     []string
	// Avail counts buffered tokens not yet returned.
	Avail      int
	// Next is the index of the token to return next.
	Next       int
	// Full is the token count at which a refill stops reading.
	Full       int
	// Closed is set once the sentinel token has been returned.
	Closed     bool
	// Docompress collapses whitespace runs in tags and contents alike.
	Docompress bool
	// Docleanup replaces non-space whitespace in tags and contents alike.
	Docleanup  bool
	// Dorepair applies both fixes, but only to tag/attribute text.
	Dorepair   bool
}

// NewXmlReader wraps in with an empty token buffer and the requested
// whitespace options. It returns nil when in is nil.
func NewXmlReader(in *bufio.Reader, doCompress, doCleanup, doRepair bool) *XmlReader {

	if in == nil {
		return nil
	}

	// one read up to ">" can yield several tokens, so the slices carry a
	// little slack beyond the refill threshold
	capacity := BUFFERSIZE + 6

	return &XmlReader{
		Reader:     in,
		Categories: make([]int, capacity),
		Values:     make([]string, capacity),
		Full:       BUFFERSIZE,
		Docompress: doCompress,
		Docleanup:  doCleanup,
		Dorepair:   doRepair,
	}
}

// NextToken returns the next token from rdr as (category, value, ok).
//
// Tokens are produced in batches: when the buffer is empty the input is read
// up to each ">" character and split into CONTENTS, START_TAG, ATTRIBUTES and
// END_TAG tokens (a self-closing tag yields a start and an end tag). Only
// "?xml" and "!DOCTYPE" constructs are kept from "?" and "!" objects; other
// processing instructions and comments are ignored. Any read error is
// reported as a single SENTINEL token, after which the reader is closed.
//
// ok is false when rdr is nil, when the reader is already closed, or when no
// token could be produced.
func NextToken(rdr *XmlReader) (int, string, bool) {

	if rdr == nil {
		return SENTINEL, "NO XmlReader", false
	}
	if rdr.Closed {
		return SENTINEL, "END OF DATA", false
	}

	// push appends one token to the buffer
	push := func(category int, value string) {
		rdr.Categories[rdr.Avail] = category
		rdr.Values[rdr.Avail] = value
		rdr.Avail++
	}

	// pushStart emits a start tag, plus its attributes when present, and
	// returns the tag name (empty when nothing was emitted)
	pushStart := func(body string) string {
		name, attrs := SplitInTwoAt(body, " ", LEFT)
		if name == "" {
			return ""
		}
		push(START_TAG, name)
		if attrs != "" {
			push(ATTRIBUTES, attrs)
		}
		return name
	}

	// refill reads XML and parses tokens until the buffer is full or the
	// input ends
	refill := func() {

		rdr.Avail = 0
		rdr.Next = 0

		for rdr.Avail < rdr.Full {

			// reading until next ">" character turns out to be significantly faster than using encoding/xml.Decoder
			chunk, err := rdr.Reader.ReadString('>')
			if err != nil {
				// report end of file with sentinel token
				push(SENTINEL, "END OF DATA")
				return
			}

			chunk = strings.TrimSpace(chunk)
			// optional fixes that apply to tags/attributes and contents alike
			if rdr.Docompress {
				chunk = CompressRunsOfSpaces(chunk)
			}
			if rdr.Docleanup && HasBadSpace(chunk) {
				chunk = CleanupBadSpaces(chunk)
			}

			if chunk == "" {
				continue
			}

			text, markup := SplitInTwoAt(chunk, "<", LEFT)

			if text != "" {
				push(CONTENTS, text)
			}

			// need at least one character in front of the closing bracket
			if len(markup) < 2 || markup[len(markup)-1] != '>' {
				continue
			}
			// left angle bracket went with the split, drop the right one too
			markup = markup[:len(markup)-1]

			// optional fixes limited to tags/attributes split over several lines
			if rdr.Dorepair {
				if HasBadSpace(markup) {
					markup = CleanupBadSpaces(markup)
				}
				markup = CompressRunsOfSpaces(markup)
			}

			if len(markup) < 1 {
				continue
			}

			last := len(markup) - 1

			switch {
			case markup[0] == '?':
				// only the XML declaration is kept among processing instructions
				if strings.HasPrefix(markup, "?xml") {
					push(XML_TAG, markup)
				}
			case markup[0] == '!':
				// only DOCTYPE is kept, comments are ignored
				if strings.HasPrefix(markup, "!DOCTYPE") {
					push(DOC_TAG, markup)
				}
			case markup[last] == '/':
				// self-closing tag, normalize by sending separate end object
				if name := pushStart(markup[:last]); name != "" {
					push(END_TAG, name)
				}
			case markup[0] == '/':
				push(END_TAG, markup[1:])
			default:
				pushStart(markup)
			}
		}
	}

	if rdr.Avail < 1 {
		refill()
	}

	if rdr.Avail < 1 {
		rdr.Closed = true
		return SENTINEL, "ERROR", false
	}

	category := rdr.Categories[rdr.Next]
	value := rdr.Values[rdr.Next]
	rdr.Next++
	rdr.Avail--

	// sentinel is delivered once, later calls report closed reader
	if category == SENTINEL {
		rdr.Closed = true
	}

	return category, value, true
}

// MISCELLANEOUS FORMATTING FUNCTIONS

// reformat XML for ease of reading
// ProcessFormat reads XML from standard input and prints it re-indented, one
// element per line, with contents unescaped and then re-escaped.
//
// args may carry -pfx and -sfx (text placed before and after the input
// stream, a single space each by default) and -xml and -doctype (replacement
// lines printed first; the corresponding line from the input is then
// suppressed). Values go through ConvertSlash. The three booleans are the
// whitespace options forwarded to NewXmlReader.
func ProcessFormat(args []string, doCompress, doCleanup, doRepair bool) {

	// kind of token printed most recently
	const (
		NOTSET = iota
		START
		ATTRS
		END
		CHAR
		OTHER
	)

	xmlx, doctype := "", ""
	pfx, sfx := " ", " "

	// pending is the flag whose operand is expected next; a flag without
	// a case below is never cleared, so later arguments are ignored
	pending := UNSET
	for _, arg := range args {
		switch pending {
		case UNSET:
			pending = ParseFlag(arg)
		case PFX:
			pfx, pending = ConvertSlash(arg), UNSET
		case SFX:
			sfx, pending = ConvertSlash(arg), UNSET
		case XML:
			xmlx, pending = ConvertSlash(arg), UNSET
		case DOCTYPE:
			doctype, pending = ConvertSlash(arg), UNSET
		}
	}

	// input stream is standard input wrapped in prefix and suffix text
	combined := io.MultiReader(bytes.NewReader([]byte(pfx)), os.Stdin, bytes.NewReader([]byte(sfx)))

	rdr := NewXmlReader(bufio.NewReader(combined), doCompress, doCleanup, doRepair)
	if rdr == nil {
		return
	}

	if xmlx != "" {
		fmt.Printf("%s\n", xmlx)
	}

	if doctype != "" {
		fmt.Printf("%s\n", doctype)
	}

	depth := 0
	state := NOTSET
	tagOpen := false

	// closeBracket finishes a start tag whose ">" was held back for attributes
	closeBracket := func() {
		if tagOpen {
			fmt.Printf(">")
			tagOpen = false
		}
	}

	// printIndent writes two spaces per nesting level
	printIndent := func() {
		for i := 0; i < depth; i++ {
			fmt.Printf("  ")
		}
	}

	for {
		cat, val, ok := NextToken(rdr)
		if !ok {
			break
		}

		switch cat {
		case XML_TAG:
			if xmlx == "" {
				fmt.Printf("<%s>\n", val)
			}
			state = NOTSET
		case DOC_TAG:
			if doctype == "" {
				fmt.Printf("<%s>\n", val)
			}
			state = NOTSET
		case START_TAG:
			closeBracket()
			// nested element starts on its own line
			if state == START || state == ATTRS {
				fmt.Printf("\n")
			}
			printIndent()
			fmt.Printf("<%s", val)
			depth++
			state = START
			tagOpen = true
		case ATTRIBUTES:
			for rest := val; rest != ""; {
				name, tail := SplitInTwoAt(rest, "=\"", LEFT)
				value, more := SplitInTwoAt(tail, "\"", LEFT)
				rest = strings.TrimSpace(more)
				fmt.Printf(" %s=\"%s\"", name, value)
			}
			state = ATTRS
		case END_TAG:
			closeBracket()
			depth--
			// end tag stays on the line of its contents or of its own start
			// tag, otherwise it is indented on a fresh line
			if state != CHAR && state != START && state != ATTRS {
				printIndent()
			}
			fmt.Printf("</%s>\n", val)
			state = END
		case CONTENTS:
			closeBracket()
			text := html.UnescapeString(val)
			if len(text) > 0 && IsNotJustWhitespace(text) {
				fmt.Printf("%s", html.EscapeString(text))
				state = CHAR
			}
		case SENTINEL:
			// nothing to print, the next NextToken call ends the loop
		default:
			closeBracket()
			state = OTHER
		}
	}
}

// display outline of XML structure
// ProcessOutline reads XML from standard input and prints the name of every
// start tag on its own line, indented two spaces per nesting level. A fixed
// list of enclosing set/result wrapper tags is not printed.
func ProcessOutline(doCompress, doCleanup, doRepair bool) {

	rdr := NewXmlReader(bufio.NewReader(os.Stdin), doCompress, doCleanup, doRepair)
	if rdr == nil {
		return
	}

	depth := 0

	for {
		cat, val, ok := NextToken(rdr)
		if !ok {
			return
		}

		switch cat {
		case START_TAG:
			switch val {
			case "eSummaryResult",
				"eLinkResult",
				"eInfoResult",
				"PubmedArticleSet",
				"DocumentSummarySet",
				"INSDSet",
				"Entrezgene-Set",
				"TaxaSet":
				// wrapper is skipped without changing the depth
				continue
			}
			for i := 0; i < depth; i++ {
				fmt.Printf("  ")
			}
			fmt.Printf("%s\n", val)
			depth++
		case END_TAG:
			depth--
		}
	}
}

// display paths to XML elements
// SynopsisLevel prints the slash-separated path of every element below
// parent, one per line, recursing at each start tag and returning at the
// matching end tag. A fixed list of set/result wrapper tags is skipped.
func SynopsisLevel(parent string, rdr *XmlReader) {

	for {
		cat, val, ok := NextToken(rdr)
		if !ok {
			return
		}

		switch cat {
		case START_TAG:
			switch val {
			case "eSummaryResult",
				"eLinkResult",
				"eInfoResult",
				"PubmedArticleSet",
				"DocumentSummarySet",
				"INSDSet",
				"Entrezgene-Set",
				"TaxaSet":
				continue
			}
			path := val
			if parent != "" {
				path = parent + "/" + val
			}
			fmt.Printf("%s\n", path)
			SynopsisLevel(path, rdr)
		case END_TAG:
			// break recursion
			return
		}
	}
}

// ProcessSynopsis prints the path of every XML element read from standard
// input by running SynopsisLevel from an empty root path.
func ProcessSynopsis(doCompress, doCleanup, doRepair bool) {

	rdr := NewXmlReader(bufio.NewReader(os.Stdin), doCompress, doCleanup, doRepair)
	if rdr != nil {
		SynopsisLevel("", rdr)
	}
}

// verify integrity of XML object nesting
// VerifyLevel checks the nesting of one XML element and prints a message for
// each problem found: a start tag after contents, attributes that do not
// follow a start tag, a mismatched or surplus end tag, misplaced contents,
// and input that ends while elements are still open. parent is the name of
// the enclosing element and level its depth (0 at the top).
func VerifyLevel(parent string, level int, rdr *XmlReader) {

	// kind of token seen most recently at this level
	const (
		NOTSET = iota
		START
		ATTRS
		END
		CHAR
		OTHER
	)

	prev := START

	for {
		cat, val, ok := NextToken(rdr)
		if !ok {
			return
		}

		switch cat {
		case START_TAG:
			if prev == CHAR {
				fmt.Printf("<%s> not expected after contents\n", val)
			}
			VerifyLevel(val, level+1, rdr)
			// child element is finished once the recursion returns
			prev = END
		case ATTRIBUTES:
			if prev != START {
				fmt.Printf("Attributes do not follow start\n")
			}
			prev = ATTRS
		case END_TAG:
			if parent != "" && parent != val {
				fmt.Printf("Expected </%s>, found </%s>\n", parent, val)
			}
			if level < 1 {
				fmt.Printf("Unexpected </%s> at end of XML\n", val)
			}
			// break recursion
			return
		case CONTENTS:
			if !(prev == START || prev == ATTRS) {
				fmt.Printf("Contents not expected before </%s>\n", parent)
			}
			prev = CHAR
		case SENTINEL:
			if level > 0 {
				fmt.Printf("Unexpected end of data\n")
			}
		default:
			prev = OTHER
		}
	}
}

// ProcessVerify checks the element nesting of the XML read from standard
// input by running VerifyLevel from the top level.
func ProcessVerify(doCompress, doCleanup, doRepair bool) {

	rdr := NewXmlReader(bufio.NewReader(os.Stdin), doCompress, doCleanup, doRepair)
	if rdr != nil {
		VerifyLevel("", 0, rdr)
	}
}

// INSDSEQ EXTRACTION COMMAND GENERATOR

// e.g., xtract -insd complete mat_peptide "%peptide" product peptide

// ProcessINSD expands the arguments of -insd into the equivalent list of
// ordinary xtract arguments.
//
// If the first argument starts with "INSD" the arguments are descriptors and
// are printed per INSDSeq record. Otherwise the arguments are an optional
// completeness selector ("+"/"complete" or "-"/"partial"), a feature key
// (several keys may be joined with "+" or ","), and one or more qualifier
// names; "INSD..." and "#INSD..." items are taken from the feature itself
// and a leading "%" requests the length of the qualifier value.
//
// When isPipe is false, arguments a shell would otherwise interpret are
// wrapped in double quotes. nil is returned when there are too few arguments.
func ProcessINSD(args []string, isPipe bool) []string {

	// quoted wraps tok in double quotes unless output is going to a pipe
	quoted := func(tok string) string {
		if isPipe {
			return tok
		}
		return fmt.Sprintf("\"%s\"", tok)
	}

	if len(args) < 1 {
		return nil
	}

	// collect descriptors

	if strings.HasPrefix(args[0], "INSD") {

		out := []string{
			"-pattern", "INSDSeq", "-ACCN", "INSDSeq_accession-version",
			"-pfx", quoted("\\n"), "-element", quoted("&ACCN"),
			"-group", "INSDSeq", "-sep", quoted("|"), "-element",
		}

		return append(out, args...)
	}

	// collect qualifiers

	if len(args) < 2 {
		return nil
	}

	wantComplete, wantPartial := false, false

	switch args[0] {
	case "+", "complete":
		wantComplete = true
		args = args[1:]
	case "-", "partial":
		wantPartial = true
		args = args[1:]
	}

	// a feature key and at least one qualifier must remain
	if len(args) < 2 {
		return nil
	}

	out := []string{
		"-pattern", "INSDSeq", "-ACCN", "INSDSeq_accession-version",
		"-group", "INSDFeature",
	}

	// limit to designated features

	feature, items := args[0], args[1:]

	// can specify multiple features separated by plus sign (e.g., CDS+mRNA) or comma (e.g., CDS,mRNA) (undocumented)
	test := "-match"
	for _, part := range strings.Split(feature, "+") {
		for _, key := range strings.Split(part, ",") {
			out = append(out, test, fmt.Sprintf("INSDFeature_key:%s", key))
			// every key after the first is an alternative
			test = "-or"
		}
	}

	switch {
	case wantComplete:
		out = append(out, "-avoid", "INSDFeature_partial5", "-and", "INSDFeature_partial3")
	case wantPartial:
		out = append(out, "-match", "INSDFeature_partial5", "-or", "INSDFeature_partial3")
	}

	out = append(out, "-pfx", quoted("\\n"), "-element", quoted("&ACCN"))

	for _, item := range items {
		switch {
		case strings.HasPrefix(item, "INSD"):
			out = append(out, "-block", "INSDFeature", "-sep", quoted("|"), "-element", item)
		case strings.HasPrefix(item, "#INSD"):
			out = append(out, "-block", "INSDFeature", "-sep", quoted("|"), "-element", quoted(item))
		case len(item) > 2 && item[0] == '%':
			// length of the qualifier value rather than the value itself
			out = append(out, "-block", "INSDQualifier",
				"-match", fmt.Sprintf("INSDQualifier_name:%s", item[1:]),
				"-element", quoted("%INSDQualifier_value"))
		default:
			out = append(out, "-block", "INSDQualifier",
				"-match", fmt.Sprintf("INSDQualifier_name:%s", item),
				"-element", "INSDQualifier_value")
		}
	}

	return out
}

// PARSE COMMAND-LINE ARGUMENTS

// Exploration nesting levels. The values index the lcname and ucname tables,
// and parsing starts at PATTERN and works downward one level at a time, so a
// larger value denotes a more outward loop.
const (
	_ = iota
	UNIT
	SUBSET
	SECTION
	BLOCK
	BRANCH
	GROUP
	DIVISION
	PATTERN
)

// Exploration flag spellings, indexed by nesting level (index 0 is unused).
// Having a distinct flag name per level lets a flat command line express
// several levels of nested "for" loops. The capitalized spellings are kept
// for backward compatibility with the original Perl implementation's
// handling of recursive definitions.
var (
	lcname = []string{"", "-unit", "-subset", "-section", "-block", "-branch", "-group", "-division", "-pattern"}

	ucname = []string{"", "-Unit", "-Subset", "-Section", "-Block", "-Branch", "-Group", "-Division", "-Pattern"}
)

// FindNextLevel looks for the highest exploration level, starting at level
// and working downward, whose flag occurs anywhere in args. It returns that
// level with its lower-case and capitalized flag spellings, or 0 and two
// empty strings when args has fewer than two entries or no flag is present.
func FindNextLevel(args []string, level int) (int, string, string) {

	if len(args) < 2 {
		return 0, "", ""
	}

	for ; level > 0; level-- {

		lower, upper := lcname[level], ucname[level]

		for _, arg := range args {
			if arg == lower || arg == upper {
				return level, lower, upper
			}
		}
	}

	return 0, "", ""
}

// SubsetCommands turns one slice of arguments into a Block.
//
// When tagMatch is false the arguments did not begin with an exploration
// flag and are stored unchanged as Commands. Otherwise args[0] is the
// exploration flag and args[1] the object to visit; an immediately following
// "-position value" pair is stored in Position; the rest is divided into
// leading Conditions (-match, -avoid, -and, -or, -contains, the numeric
// comparison flags, and their operands) and Commands, which start at the
// first other dash-prefixed argument.
//
// If no such command is present, every remaining argument is a condition and
// Commands is empty. The split point therefore defaults to len(args), so the
// final operand is not misfiled as a command.
func SubsetCommands(args []string, tagMatch bool) *Block {

	if !tagMatch {
		return &Block{Commands: args}
	}

	remaining := len(args)

	visit := ""

	// extract name of object to visit
	if remaining > 1 {
		visit = args[1]
		args = args[2:]
		remaining -= 2
	}

	if remaining < 1 {
		return &Block{Visit: visit}
	}

	position := ""

	// extract position argument
	if remaining > 1 && args[0] == "-position" {
		position = args[1]
		args = args[2:]
		remaining -= 2
	}

	if remaining < 1 {
		return &Block{Visit: visit, Position: position}
	}

	// isCondition reports whether a flag belongs to the conditional tests
	isCondition := func(flag string) bool {
		switch flag {
		case "-match", "-avoid", "-and", "-or", "-contains":
			return true
		case "-gt", "-ge", "-lt", "-le", "-eq", "-ne":
			// numeric tests on element values for -match (undocumented)
			return true
		}
		return false
	}

	// find point between conditionals and remaining commands
	partition := len(args)
	for cur, str := range args {

		// operands and condition flags stay on the condition side
		if len(str) < 1 || str[0] != '-' || isCondition(str) {
			continue
		}

		partition = cur
		break
	}

	// separate conditional and execution arguments
	return &Block{
		Visit:      visit,
		Position:   position,
		Conditions: args[:partition],
		Commands:   args[partition:],
	}
}

// ParseCommands splits parent.Commands into nested sub-blocks. It finds the
// highest exploration flag at or below startLevel, cuts the arguments at each
// occurrence of that flag, converts every piece with SubsetCommands, parses
// each piece recursively one level lower, and appends it to parent.Subtasks.
// When no exploration flag remains, the arguments of a block that visits an
// object are wrapped in a single leaf sub-block.
func ParseCommands(parent *Block, startLevel int) {

	args := parent.Commands

	level, lctag, uctag := FindNextLevel(args, startLevel)

	if level < 1 {

		if parent.Visit != "" && len(args) > 0 {
			parent.Subtasks = append(parent.Subtasks, &Block{Commands: args})
			parent.Commands = nil
		}

		// break recursion
		return
	}

	// attach converts one piece of the argument list and hangs it on parent
	attach := func(piece []string, startsWithTag bool) {
		child := SubsetCommands(piece, startsWithTag)
		ParseCommands(child, level-1)
		parent.Subtasks = append(parent.Subtasks, child)
	}

	start := 0
	tagMatch := false

	// cut at every occurrence of the current exploration flag

	for idx, txt := range args {
		if txt != lctag && txt != uctag {
			continue
		}

		// a flag in first position only marks the opening piece as tagged
		if idx > 0 {
			attach(args[start:idx], tagMatch)
			start = idx
		}

		tagMatch = true
	}

	if start < len(args) {
		attach(args[start:], tagMatch)
	}

	// clear execution arguments from parent after subsetting
	parent.Commands = nil
}

// ParseArguments builds the nested exploration instructions from the
// command-line arguments. It returns the single top-level block, or nil when
// parsing does not produce exactly one.
func ParseArguments(args []string) *Block {

	// placeholder block holding a private copy of the arguments
	head := &Block{}
	for i := range args {
		head.Commands = append(head.Commands, args[i])
	}

	ParseCommands(head, PATTERN)

	// skip past empty placeholder
	if len(head.Subtasks) == 1 {
		return head.Subtasks[0]
	}

	return nil
}

// COLLECT AND FORMAT REQUESTED XML VALUES

// return matched element values to callback
// ExploreElements walks the subtree rooted at curr and passes the value of
// every matching element to proc.
//
// A node matches when its name equals match (or match is empty and an
// attribute is requested) and prnt is empty or equals the node's Parent.
// With attrib set, the value of that attribute is sent; otherwise the node's
// Contents are sent; for a matching container without contents an empty
// string is sent, so callers can count it, and the walk continues into its
// children. The walk does not descend below a node whose value was sent.
func ExploreElements(curr *Node, prnt, match, attrib string, proc func(string)) {

	if curr == nil || proc == nil {
		return
	}

	nameFits := curr.Name == match || (match == "" && attrib != "")
	parentFits := prnt == "" || curr.Parent == prnt

	if nameFits && parentFits {
		switch {
		case attrib != "":
			for i := range curr.Attributes {
				if curr.Attributes[i].Name == attrib {
					proc(curr.Attributes[i].Value)
					return
				}
			}
			// attribute absent on this node, keep looking in children
		case curr.Contents != "":
			proc(curr.Contents)
			return
		case curr.Children != nil:
			// for container object, send empty string to callback to increment count
			proc("")
		}
	}

	for _, child := range curr.Children {
		ExploreElements(child, prnt, match, attrib, proc)
	}
}

// -element "*" prints XML subtree in one string
// PrintSubtree serializes the subtree rooted at curr as XML, handing the
// text to proc piece by piece: the start tag with its attributes, then either
// the HTML-escaped contents or, when contents are empty, each child in turn,
// and finally the end tag. Attribute values are passed through as stored.
func PrintSubtree(curr *Node, proc func(string)) {

	if curr == nil || proc == nil {
		return
	}

	// start tag with attributes
	proc("<")
	proc(curr.Name)
	for i := range curr.Attributes {
		attr := &curr.Attributes[i]
		for _, piece := range []string{" ", attr.Name, "=\"", attr.Value, "\""} {
			proc(piece)
		}
	}
	proc(">")

	// contents take precedence over children
	if curr.Contents == "" {
		for _, child := range curr.Children {
			PrintSubtree(child, proc)
		}
	} else {
		proc(html.EscapeString(curr.Contents))
	}

	// end tag
	for _, piece := range []string{"<", "/", curr.Name, ">"} {
		proc(piece)
	}
}

// ProcessElement evaluates a single element expression against the subtree
// rooted at curr and reports the resulting string(s) through acc.
//
// A leading character on match selects a special form:
//
//	"*"      the whole subtree printed as one XML string
//	"&NAME"  the value of a stored variable (NAME must be capitals or digits,
//	         otherwise the program exits with an error)
//	"#name"  the number of matching element objects
//	"%name"  the total length of the matching element strings
//
// Otherwise status selects the behavior: ELEMENT reports every non-empty value,
// FIRST and LAST report one value, and SUM, MIN and MAX report a single integer
// computed from the values that parse as integers (nothing if none do).
func ProcessElement(curr *Node, prnt, match, attrib string, status int, variables map[string]string, acc func(string)) {

	if curr == nil || acc == nil {
		return
	}

	// request kinds selected by the first character of match
	const (
		byName = iota
		wholeSubtree
		storedVariable
		objectCount
		textLength
	)

	kind := byName

	// check for special character at beginning of name
	if len(match) > 1 {
		switch match[0] {
		case '&':
			if !IsAllCapsOrDigits(match[1:]) {
				fmt.Fprintf(os.Stderr, "\nUnrecognized variable '%s'\n", match)
				os.Exit(1)
			}
			kind = storedVariable
			match = match[1:]
		case '#':
			kind = objectCount
			match = match[1:]
		case '%':
			kind = textLength
			match = match[1:]
		}
	} else if match == "*" {
		kind = wholeSubtree
	}

	// visit sends every matching value to fn
	visit := func(fn func(string)) {
		ExploreElements(curr, prnt, match, attrib, fn)
	}

	// reduce folds all integer-valued matches into one number and reports it,
	// staying silent when no value parses as an integer
	reduce := func(combine func(total, value int) int) {
		total, seen := 0, false
		visit(func(txt string) {
			value, err := strconv.Atoi(txt)
			if err != nil {
				return
			}
			if seen {
				total = combine(total, value)
			} else {
				total = value
			}
			seen = true
		})
		if seen {
			acc(strconv.Itoa(total))
		}
	}

	switch kind {
	case wholeSubtree:
		// -element "*" prints current XML subtree on one line
		var text bytes.Buffer
		PrintSubtree(curr, func(piece string) {
			if piece != "" {
				text.WriteString(piece)
			}
		})
		if line := text.String(); line != "" {
			acc(line)
		}
		return

	case storedVariable:
		// use value of stored variable
		if stored, ok := variables[match]; ok {
			acc(stored)
		}
		return

	case objectCount:
		// number of element objects
		objects := 0
		visit(func(string) {
			objects++
		})
		acc(strconv.Itoa(objects))
		return

	case textLength:
		// length of element strings
		total := 0
		visit(func(txt string) {
			total += len(txt)
		})
		acc(strconv.Itoa(total))
		return
	}

	switch status {
	case ELEMENT:
		visit(func(txt string) {
			if txt != "" {
				acc(txt)
			}
		})

	case FIRST:
		// keep the first non-empty value
		chosen := ""
		visit(func(txt string) {
			if chosen == "" {
				chosen = txt
			}
		})
		if chosen != "" {
			acc(chosen)
		}

	case LAST:
		// keep whatever value was reported last
		chosen := ""
		visit(func(txt string) {
			chosen = txt
		})
		if chosen != "" {
			acc(chosen)
		}

	case SUM:
		// sum of element values
		reduce(func(total, value int) int {
			return total + value
		})

	case MIN:
		// minimum of element values
		reduce(func(total, value int) int {
			if value < total {
				return value
			}
			return total
		})

	case MAX:
		// maximum of element values
		reduce(func(total, value int) int {
			if value > total {
				return value
			}
			return total
		})
	}
}

// ProcessClause evaluates one comma-separated group of element expressions and
// formats the result as prev + pfx + values + sfx.
//
// For ELEMENT, FIRST and LAST the non-empty values are joined with sep. For
// SUM, MIN and MAX the values from all items in the group are combined into a
// single integer. Values that do not parse as integers are ignored in the
// numeric modes: they neither count as a result nor take part in the
// comparison, so a text value can no longer force a spurious "0".
//
// The second return value is false (and the string empty) when nothing was
// produced, so the caller can leave its column state untouched.
func ProcessClause(curr *Node, str, prev, pfx, sfx, sep string, status int, variables map[string]string) (string, bool) {

	ok := false
	num := 0

	// format results in buffer
	var buffer bytes.Buffer

	buffer.WriteString(prev)
	buffer.WriteString(pfx)
	between := ""

	textual := status == ELEMENT || status == FIRST || status == LAST

	// element names combined with commas are treated as a prefix-separator-suffix group
	for _, item := range strings.Split(str, ",") {
		prnt, match := SplitInTwoAt(item, "/", RIGHT)
		match, attrib := SplitInTwoAt(match, "@", LEFT)

		ProcessElement(curr, prnt, match, attrib, status, variables,
			func(str string) {
				if str == "" {
					return
				}

				if textual {
					buffer.WriteString(between)
					buffer.WriteString(str)
					between = sep
					ok = true
					return
				}

				// evaluate sum/min/max on comma-separated elements,
				// skipping anything that is not an integer
				value, err := strconv.Atoi(str)
				if err != nil {
					return
				}

				switch status {
				case SUM:
					num += value
				case MIN:
					if !ok || value < num {
						num = value
					}
				case MAX:
					if !ok || value > num {
						num = value
					}
				}

				// only a parsed number counts as a result in numeric modes
				ok = true
			})
	}

	if !ok {
		return "", false
	}

	if !textual {
		buffer.WriteString(strconv.Itoa(num))
	}

	buffer.WriteString(sfx)

	return buffer.String(), true
}

// ProcessInstructions executes the extraction commands of one block against
// the node curr, sending formatted output to accum.
//
// The command list is read as a small state machine: a flag argument sets the
// status, and the following argument(s) are interpreted according to it.
// Extraction statuses (ELEMENT, FIRST, LAST, SUM, MIN, MAX) stay in force until
// the next flag, so several clauses may follow one flag; all other statuses
// consume exactly one argument.
//
// tab is the text written before the next value and ret the text to write at
// the end of the record; both are updated once output is produced and are
// returned so the caller can carry them into the next block.
func ProcessInstructions(cmds *Block, curr *Node, tab, ret string, variables map[string]string, accum func(string)) (string, string) {

	if accum == nil {
		return tab, ret
	}

	// formatting state local to this block
	pfx, sfx, varname := "", "", ""
	sep, col, lin := "\t", "\t", "\n"

	status := UNSET

	// beginFlag interprets arg as a flag and records the resulting status
	beginFlag := func(arg string) {
		status = ParseFlag(arg)
		switch status {
		case VARIABLE:
			// drop the leading character to obtain the variable name
			varname = arg[1:]
		case CLR:
			// clear tab (sent to ProcessClause as prev argument) after the fact (undocumented)
			tab = ""
			status = UNSET
		case RST:
			// reset pfx, sfx, and sep (undocumented)
			pfx, sfx, sep = "", "", "\t"
			status = UNSET
		case UNSET:
			fmt.Fprintf(os.Stderr, "\nNo -element before '%s'\n", arg)
			os.Exit(1)
		case UNRECOGNIZED:
			fmt.Fprintf(os.Stderr, "\nUnrecognized argument '%s'\n", arg)
			os.Exit(1)
		}
	}

	// process commands
	for _, arg := range cmds.Commands {
		switch status {
		case UNSET:
			beginFlag(arg)
			continue

		case ELEMENT, FIRST, LAST, SUM, MIN, MAX:
			// status persists so that further clauses can follow the same flag
			if len(arg) > 1 && arg[0] == '-' {
				beginFlag(arg)
			} else if txt, ok := ProcessClause(curr, arg, tab, pfx, sfx, sep, status, variables); ok {
				tab = col
				ret = lin
				accum(txt)
			}
			continue

		case PFX:
			pfx = ConvertSlash(arg)
		case SFX:
			sfx = ConvertSlash(arg)
		case SEP:
			sep = ConvertSlash(arg)
		case TAB:
			col = ConvertSlash(arg)
		case RET:
			lin = ConvertSlash(arg)

		case LBL:
			// label is written unconditionally, preceded by the pending tab
			label := ConvertSlash(arg)
			accum(tab)
			accum(label)
			tab = col
			ret = lin

		case VARIABLE:
			n := len(arg)
			switch {
			case n > 1 && arg[0] == '(' && arg[n-1] == ')':
				// set variable from literal text inside parentheses;
				// -match "&VARIABLE" will succeed if set to blank with empty parentheses "()"
				variables[varname] = arg[1 : n-1]
			case arg == "":
				// -match "&VARIABLE" will fail if initialized with empty string ""
				delete(variables, varname)
			default:
				// set variable from extracted element text, if any was found
				if txt, ok := ProcessClause(curr, arg, "", pfx, sfx, sep, ELEMENT, variables); ok {
					variables[varname] = txt
				}
			}
			varname = ""

		case UNRECOGNIZED:
			fmt.Fprintf(os.Stderr, "\nUnrecognized argument '%s'\n", arg)
			os.Exit(1)

		default:
			// any other status leaves the state untouched
			continue
		}

		// single-argument statuses are finished after one argument
		status = UNSET
	}

	return tab, ret
}

// -MATCH AND -AVOID CONDITIONAL TESTS

// HasElement reports whether the expression str is satisfied below curr.
//
// str is split at ":" into an element part and an optional required value; the
// element part is further split into parent, name and attribute. A leading "#"
// or "%" on the name is ignored, and a leading "&" refers to a stored variable
// (the program exits if the variable name is not capitals or digits).
//
// For a variable, the test succeeds when the variable exists and, if a value
// was given, equals it. Otherwise, when substr is non-empty the test is a
// case-insensitive substring search over the matching values (and fails
// outright if a ":value" was also given); when substr is empty the test
// succeeds if any matching value exists and, if a value was given, equals it.
func HasElement(curr *Node, str, substr string, variables map[string]string) bool {

	match, val := SplitInTwoAt(str, ":", LEFT)
	prnt, match := SplitInTwoAt(match, "/", RIGHT)
	match, attrib := SplitInTwoAt(match, "@", LEFT)

	isVariable := false

	// skip pound or percent character at beginning of name
	if len(match) > 1 {
		switch match[0] {
		case '&':
			if !IsAllCapsOrDigits(match[1:]) {
				fmt.Fprintf(os.Stderr, "\nUnrecognized variable '%s'\n", match)
				os.Exit(1)
			}
			isVariable = true
			match = match[1:]
		case '#', '%':
			match = match[1:]
		}
	}

	if isVariable {
		// use value of stored variable
		stored, ok := variables[match]
		return ok && (val == "" || val == stored)
	}

	found := false

	if substr != "" {
		// substring test cannot be combined with an exact value
		if val != "" {
			return false
		}

		needle := strings.ToUpper(substr)

		ExploreElements(curr, prnt, match, attrib,
			func(txt string) {
				if strings.Contains(strings.ToUpper(txt), needle) {
					found = true
				}
			})

		return found
	}

	ExploreElements(curr, prnt, match, attrib,
		func(txt string) {
			if val == "" || val == txt {
				found = true
			}
		})

	return found
}

// GetValue resolves str to an integer for use in numeric comparisons.
//
// A literal integer is returned directly. Otherwise str is treated as an
// element expression (parent/name@attribute); an expression carrying a
// ":value" part is rejected. A leading "&" reads a stored variable, "#" yields
// the number of matching objects, "%" the total length of the matching
// strings, and a plain name yields the last matching value that parses as an
// integer. The boolean result is false when no integer could be obtained.
func GetValue(curr *Node, str string, variables map[string]string) (int, bool) {

	// check for numeric argument
	if number, err := strconv.Atoi(str); err == nil {
		return number, true
	}

	match, val := SplitInTwoAt(str, ":", LEFT)
	prnt, match := SplitInTwoAt(match, "/", RIGHT)
	match, attrib := SplitInTwoAt(match, "@", LEFT)

	if val != "" {
		return 0, false
	}

	// request kinds selected by the first character of the name
	const (
		byValue = iota
		storedVariable
		objectCount
		textLength
	)

	kind := byValue

	// check for pound or percent character at beginning of name
	if len(match) > 1 {
		switch match[0] {
		case '&':
			if !IsAllCapsOrDigits(match[1:]) {
				fmt.Fprintf(os.Stderr, "\nUnrecognized variable '%s'\n", match)
				os.Exit(1)
			}
			kind = storedVariable
			match = match[1:]
		case '#':
			kind = objectCount
			match = match[1:]
		case '%':
			kind = textLength
			match = match[1:]
		}
	}

	switch kind {
	case storedVariable:
		// use value of stored variable
		stored, ok := variables[match]
		if !ok {
			return 0, false
		}
		value, err := strconv.Atoi(stored)
		if err != nil {
			return 0, false
		}
		return value, true

	case objectCount:
		// number of element objects; found only if at least one was seen
		objects := 0
		ExploreElements(curr, prnt, match, attrib,
			func(string) {
				objects++
			})
		return objects, objects > 0

	case textLength:
		// length of element strings; an empty string still counts as found
		total := 0
		seen := false
		ExploreElements(curr, prnt, match, attrib,
			func(txt string) {
				total += len(txt)
				seen = true
			})
		return total, seen
	}

	// plain element: the last integer-valued match wins
	result := 0
	found := false

	ExploreElements(curr, prnt, match, attrib,
		func(txt string) {
			if value, err := strconv.Atoi(txt); err == nil {
				result = value
				found = true
			}
		})

	return result, found
}

// InRange evaluates the numeric comparison "str OP value", where OP is given
// by status (GT, GE, LT, LE, EQ or NE).
//
// str is the element expression remembered from the preceding -match and
// value is the argument that followed the comparison flag; both are resolved
// with GetValue. The element is the left-hand operand, so a GT test succeeds
// when the element's value is greater than the argument. The result is false
// when either side does not resolve to an integer or status is not a
// comparison operator.
func InRange(curr *Node, str, value string, status int, variables map[string]string) bool {

	// first argument must be an element/count/length/variable (numeric value will fail original -match)
	x, okx := GetValue(curr, str, variables)

	// second argument (after -lt, -ge, etc.) may be numeric, but can also be an element expression
	y, oky := GetValue(curr, value, variables)

	// both arguments must resolve to integers
	if !okx || !oky {
		return false
	}

	switch status {
	case GT:
		return x > y
	case GE:
		return x >= y
	case LT:
		return x < y
	case LE:
		return x <= y
	case EQ:
		return x == y
	case NE:
		return x != y
	default:
		return false
	}
}

// ConditionsAreSatisfied applies the conditional arguments stored in
// cmds.Conditions to the node curr and reports whether processing of the node
// should go ahead.
//
// Arguments are read in flag/value pairs. A MATCH flag opens a group with one
// required test; each AND, CONTAINS or numeric comparison adds another
// required test, while OR adds an alternative without raising the requirement.
// A group passes when at least as many tests succeeded as were required.
// An AVOID flag fails immediately if its element is present, and an OR that
// follows an AVOID fails in the same way.
func ConditionsAreSatisfied(cmds *Block, curr *Node, variables map[string]string) bool {

	needed := 0
	seen := 0
	inMatch := false
	inAvoid := false

	// previous element name needs to be remembered for subsequent numeric tests
	lastElement := ""

	// groupFailed reports whether the currently open -match group fell short
	groupFailed := func() bool {
		return inMatch && seen < needed
	}

	// tally tests for the element and counts a hit; it returns false only
	// when a hit violates an active -avoid
	tally := func(item string) bool {
		lastElement = item
		if HasElement(curr, item, "", variables) {
			if inAvoid {
				return false
			}
			seen++
		}
		return true
	}

	// test conditional arguments
	state := UNSET
	for _, arg := range cmds.Conditions {
		switch state {
		case UNSET:
			state = ParseFlag(arg)
			continue

		case AVOID:
			// -avoid closes any open -match group, then tests absence of element
			if groupFailed() {
				return false
			}
			needed, seen = 0, 0
			inMatch, inAvoid = false, true
			if HasElement(curr, arg, "", variables) {
				return false
			}

		case MATCH:
			// checking for failure here allows for multiple -match [ -and / -or ] clauses
			if groupFailed() {
				return false
			}
			needed, seen = 1, 0
			inMatch, inAvoid = true, false
			if !tally(arg) {
				return false
			}

		case AND:
			needed++
			if !tally(arg) {
				return false
			}

		case OR:
			if !tally(arg) {
				return false
			}

		case CONTAINS:
			// substring test on element values for -match (undocumented)
			needed++
			if HasElement(curr, lastElement, arg, variables) {
				seen++
			}

		case GT, GE, LT, LE, EQ, NE:
			// numeric tests on element values for -match (undocumented)
			needed++
			if InRange(curr, lastElement, arg, state, variables) {
				seen++
			}

		default:
			// any other flag leaves the state as it is
			continue
		}

		// value consumed, next argument is a flag again
		state = UNSET
	}

	return !groupFailed()
}

// RECURSIVELY PROCESS EXPLORATION COMMANDS AND XML DATA STRUCTURE

// ExploreNodes walks the subtree rooted at curr and calls proc for every node
// whose name equals match (or any node when match is "*") and whose recorded
// parent equals prnt (any parent when prnt is empty). A matched node is passed
// to proc and not searched further; unmatched nodes are searched recursively.
func ExploreNodes(curr *Node, prnt, match string, proc func(*Node)) {

	if curr == nil || proc == nil {
		return
	}

	// match is "*" for heterogeneous data constructs, e.g., -pattern PubmedArticleSet/*
	nameFits := match == "*" || curr.Name == match
	parentFits := prnt == "" || curr.Parent == prnt

	if nameFits && parentFits {
		proc(curr)
		return
	}

	// a "*" parent only blocks a match at this level; dropping it for the
	// descent allows exploration within recursive data,
	// e.g., -pattern Taxon -block */Taxon
	below := prnt
	if below == "*" {
		below = ""
	}

	// explore child nodes
	for i := range curr.Children {
		ExploreNodes(curr.Children[i], below, match, proc)
	}
}

// ProcessCommands executes one block of the command tree against the node
// curr and returns the updated tab and ret strings.
//
// A block holding extraction commands is handed to ProcessInstructions.
// Otherwise the block names an element to visit (cmds.Visit, optionally
// "parent/name"): every matching node below curr that passes the block's
// conditions has all subtasks applied to it. cmds.Position restricts the visit
// to the "first" match, the "last" match, or the match at a 1-based numeric
// position; any other value is a fatal error.
//
// When no node matches, nothing is processed, regardless of position mode.
func ProcessCommands(cmds *Block, curr *Node, tab, ret string, variables map[string]string, accum func(string)) (string, string) {

	if accum == nil {
		return tab, ret
	}

	if len(cmds.Commands) > 0 {
		return ProcessInstructions(cmds, curr, tab, ret, variables, accum)
	}

	if cmds.Visit == "" {
		return tab, ret
	}

	// closure passes local variables to callback
	ProcessNode := func(node *Node) {

		// "first" and "last" end up with no node when nothing matched;
		// subtasks must not run in that case
		if node == nil {
			return
		}

		// apply -match or -avoid tests
		if !ConditionsAreSatisfied(cmds, node, variables) {
			return
		}

		// process sub commands on child node
		for _, sub := range cmds.Subtasks {
			tab, ret = ProcessCommands(sub, node, tab, ret, variables, accum)
		}
	}

	prnt, match := SplitInTwoAt(cmds.Visit, "/", RIGHT)

	switch cmds.Position {
	case "":

		ExploreNodes(curr, prnt, match, ProcessNode)

	case "first":

		var single *Node

		ExploreNodes(curr, prnt, match,
			func(node *Node) {
				if single == nil {
					single = node
				}
			})

		ProcessNode(single)

	case "last":

		var single *Node

		ExploreNodes(curr, prnt, match,
			func(node *Node) {
				single = node
			})

		ProcessNode(single)

	default:

		// use numeric position (undocumented)
		number, err := strconv.Atoi(cmds.Position)
		if err != nil {
			fmt.Fprintf(os.Stderr, "\nUnrecognized position '%s'\n", cmds.Position)
			os.Exit(1)
		}

		pos := 0

		ExploreNodes(curr, prnt, match,
			func(node *Node) {
				pos++
				if pos == number {
					ProcessNode(node)
				}
			})
	}

	return tab, ret
}

// PARTITION TOKEN STREAM BY -PATTERN ARGUMENT, PROCESS EACH COMPONENT RECORD AS SOON AS IT IS READ

// ParseXml builds the node tree for the element named start, whose start tag
// has already been consumed, by reading tokens from rdr until the matching end
// tag. parent is recorded as the node's parent name.
//
// Nested start tags are parsed recursively into child nodes, attribute text is
// split into name="value" pairs, and text contents are HTML-unescaped.
//
// The boolean result is false, with a nil node, when rdr is nil or a SENTINEL
// token is read at any nesting depth; a failure inside a child element is
// propagated to the caller instead of being dropped. If the token stream
// simply ends before the end tag, the node collected so far is returned.
func ParseXml(start, parent string, rdr *XmlReader) (*Node, bool) {

	if rdr == nil {
		return nil, false
	}

	node := &Node{Name: start, Parent: parent}

	for {
		// read next token
		cat, val, ok := NextToken(rdr)
		if !ok {
			// no more tokens; hand back whatever was collected
			break
		}

		switch cat {
		case SENTINEL:
			return nil, false
		case START_TAG:
			obj, ok := ParseXml(val, node.Name, rdr)
			if !ok {
				// a plain break would only leave the switch, so fail explicitly
				return nil, false
			}
			// append to children
			node.Children = append(node.Children, obj)
		case ATTRIBUTES:
			// consume one name="value" pair per iteration
			attributes := val
			for attributes != "" {
				tag, rem := SplitInTwoAt(attributes, "=\"", LEFT)
				value, nxt := SplitInTwoAt(rem, "\"", LEFT)
				attributes = strings.TrimSpace(nxt)
				node.Attributes = append(node.Attributes, Attrib{Name: tag, Value: value})
			}
		case CONTENTS:
			node.Contents = html.UnescapeString(val)
		case END_TAG:
			// pop out of recursive call
			return node, true
		default:
		}
	}

	return node, true
}

// ProcessPattern reads one complete record (the element named start, whose
// start tag was just consumed) from rdr, runs the command tree cmds on it, and
// prints the collected output as a single string followed by the record
// terminator. It returns false when its arguments are nil or the record cannot
// be parsed, which tells the caller to stop reading.
func ProcessPattern(start, parent string, cmds *Block, rdr *XmlReader) bool {

	if cmds == nil || rdr == nil {
		return false
	}

	// the node tree lives only for the duration of this call
	record, ok := ParseXml(start, parent, rdr)
	if !ok {
		return false
	}

	// variables are likewise scoped to the current -pattern record
	variables := make(map[string]string)

	var out bytes.Buffer

	// collect every non-empty piece of output
	collect := func(piece string) {
		if piece != "" {
			out.WriteString(piece)
		}
	}

	// start processing at top of command tree and top of XML subregion selected by -pattern
	_, ret := ProcessCommands(cmds, record, "", "", variables, collect)

	// record terminator, empty when nothing was printed
	out.WriteString(ret)

	// print consolidated result string
	if out.Len() > 0 {
		fmt.Print(out.String())
	}

	return true
}

// ProcessXml scans the token stream from rdr for start tags named by pattern
// and processes each record as soon as it has been read.
//
// With a plain pattern every matching element is one record. With the
// "parent/*" form, once the parent start tag is seen, every subsequent start
// tag begins a record whose parent name is the pattern parent. Scanning stops
// when the tokens run out or a record fails to process.
func ProcessXml(cmds *Block, pattern string, rdr *XmlReader) {

	if cmds == nil || pattern == "" || rdr == nil {
		return
	}

	// look for -pattern parent/* construct for heterogeneous data, e.g., -pattern PubmedArticleSet/*
	topPat, star := SplitInTwoAt(pattern, "/", LEFT)
	if topPat == "" {
		return
	}
	heterogeneous := star == "*"

	// nextStart skips ahead to the next start tag and returns its name,
	// or false once the reader has no more tokens
	nextStart := func() (string, bool) {
		for {
			cat, val, ok := NextToken(rdr)
			if !ok {
				return "", false
			}
			if cat == START_TAG {
				return val, true
			}
		}
	}

	// scan xml tags, process as soon as each top-level pattern is read
	for {
		name, ok := nextStart()
		if !ok {
			return
		}
		if name != topPat {
			continue
		}

		if !heterogeneous {
			// read and process one -pattern object at a time
			if !ProcessPattern(name, "", cmds, rdr) {
				return
			}
			continue
		}

		// read and process heterogeneous objects immediately below -pattern parent
		for {
			child, ok := nextStart()
			if !ok {
				break
			}
			if !ProcessPattern(child, topPat, cmds, rdr) {
				return
			}
		}
	}
}

// MAIN FUNCTION

// e.g., xtract -pattern PubmedArticle -element MedlineCitation/PMID -block Author -element Initials,LastName

// main interprets the command line and runs the requested operation.
//
// Informational flags (-version, -help, -undocumented, -examples) print text
// and exit. Optional -compress, -cleanup and -repair flags may precede the
// main command. The stand-alone commands -format, -outline, -synopsis and
// -verify are dispatched to their own processors. -insd either prints the
// generated xtract arguments (no piped input) or replaces the arguments and
// continues. Everything else must start with -pattern and is executed against
// XML read from standard input.
func main() {

	// skip past executable name
	args := os.Args[1:]

	if len(args) < 1 {
		fmt.Fprintf(os.Stderr, "\nNo command-line arguments supplied to xtract\n")
		os.Exit(1)
	}

	switch args[0] {
	case "-version":
		fmt.Printf("%s\n", xtract_version)
		return
	case "-help":
		// -help prints summary of commands and constructs
		fmt.Printf("xtract %s (Go version)\n%s\n", xtract_version, xtract_help)
		return
	case "-undocumented":
		// -undocumented prints summary of currently undocumented commands (other than itself)
		fmt.Printf("xtract %s (Go version)\n%s\n", xtract_version, xtract_undocumented)
		return
	case "-examples", "-example":
		// -examples prints examples of EDirect and xtract usage
		fmt.Printf("xtract %s (Go version)\n%s\n", xtract_version, xtract_examples)
		return
	}

	// -compress, -cleanup, and -repair flags (undocumented)
	doCompress := false
	doCleanup := false
	doRepair := false

	for {
		if args[0] == "-compress" {
			doCompress = true
		} else if args[0] == "-cleanup" {
			doCleanup = true
		} else if args[0] == "-repair" {
			doRepair = true
		} else {
			break
		}
		args = args[1:]

		// a command must follow the option flags
		if len(args) < 1 {
			fmt.Fprintf(os.Stderr, "\nInsufficient command-line arguments supplied to xtract\n")
			os.Exit(1)
		}
	}

	// handle special cases

	switch args[0] {
	case "-format":
		// -format reformats XML data
		ProcessFormat(args[1:], doCompress, doCleanup, doRepair)
		return
	case "-outline":
		// -outline displays container names and indentation of XML data
		ProcessOutline(doCompress, doCleanup, doRepair)
		return
	case "-synopsis":
		// -synopsis displays count of full paths
		ProcessSynopsis(doCompress, doCleanup, doRepair)
		return
	case "-verify", "-validate":
		// -verify checks integrity of XML (undocumented)
		ProcessVerify(doCompress, doCleanup, doRepair)
		return
	}

	// -insd to simplify extraction of INSDSeq qualifiers
	if args[0] == "-insd" {

		args = args[1:]

		// input counts as piped when stdin is not a character device
		fi, err := os.Stdin.Stat()
		if err != nil {
			fmt.Fprintf(os.Stderr, "\nUnable to determine status of standard input: %s\n", err.Error())
			os.Exit(1)
		}
		isPipe := (fi.Mode() & os.ModeCharDevice) == 0

		insd := ProcessINSD(args, isPipe)

		if !isPipe {
			// no piped input, so write output instructions
			fmt.Printf("xtract")
			for _, str := range insd {
				fmt.Printf(" %s", str)
			}
			// fmt.Printf("| \\\n")
			fmt.Printf("\n")
			return
		}

		// data in pipe, so replace arguments, execute dynamically
		args = insd
	}

	if len(args) < 3 {
		fmt.Fprintf(os.Stderr, "\nInsufficient command-line arguments supplied to xtract\n")
		os.Exit(1)
	}

	// parse extraction arguments

	// make sure top-level -pattern command is next
	topPattern := ""
	if args[0] == "-pattern" || args[0] == "-Pattern" {
		topPattern = args[1]
	}
	if topPattern == "" {
		fmt.Fprintf(os.Stderr, "\nNo -pattern in command-line arguments\n")
		os.Exit(1)
	}

	// parse nested exploration instruction from command-line arguments
	cmds := ParseArguments(args)
	if cmds == nil {
		fmt.Fprintf(os.Stderr, "\nProblem parsing command-line arguments\n")
		os.Exit(1)
	}

	// parse XML and execute extraction commands

	// create XML parser
	rdr := NewXmlReader(bufio.NewReader(os.Stdin), doCompress, doCleanup, doRepair)
	if rdr == nil {
		fmt.Fprintf(os.Stderr, "\nUnable to create XML Reader\n")
		os.Exit(1)
	}

	// group tokens by top pattern, parse and process each record
	ProcessXml(cmds, topPattern, rdr)
}
