#!/usr/bin/python
""" glean.py -- GRDDL client implementation

USAGE:
  python glean.py --help
  python glean.py [opts] --output output.rdf http://.../input.html
  opts:
    --copy=input-copy
    --debug
    
  to glean RDF statements as per transformations in input
  and write them to output.rdf.

  input-copy.html is an optional local copy of the input document.


REQUIREMENTS:
  /usr/bin/xsltproc

REFERENCES:
 GRDDL Data Views: Getting Started, Learning More
 http://www.w3.org/2003/g/data-view
and
 Gleaning Resource Descriptions from Dialects of Languages (GRDDL)
 http://www.w3.org/2004/01/rdxh/spec
"""

#TODO: peeking-into-profile-docs (working; need to update reference docs)
#
#ISSUE: I'd rather use pipes than temp files,
# but spawn() is more secure than popen()
# since it allows args to be parsed individually
# rather than as one command string to be parsed
# again by a shell.
#
#
#LICENSE: Open Source: Share and Enjoy.
#
#GRDDL Workspace: http://www.w3.org/2003/g/
#
#Copyright 2002-2003 World Wide Web Consortium, (Massachusetts
#Institute of Technology, European Research Consortium for
#Informatics and Mathematics, Keio University). All Rights
#Reserved. This work is distributed under the W3C(R) Software License
#  http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
#in the hope that it will be useful, but WITHOUT ANY WARRANTY;
#without even the implied warranty of MERCHANTABILITY or FITNESS FOR
#A PARTICULAR PURPOSE.

# See change log at end
__version__ = '$Id: glean.py,v 1.1 2004/12/13 19:01:57 ben Exp $'

import sys, getopt
import os, tempfile
import urllib2

import RDF
# Redland RDF Application Framework - Python API Reference:
# http://www.redland.opensource.ac.uk/docs/pydoc/RDF.html#Model
# tested with: http://packages.debian.org/python2.3-librdf 0.9.16-1_i386

# Path of the xsltproc executable that spawn() is asked to run.
XSLTPROC = "/usr/bin/xsltproc"
# Stylesheet run by getTransforms(); handed to xsltproc exactly as written.
GET_TRANSFORMS = "getTransforms.xsl"

# These two addresses are what grddl() uses as its default "already" list.
HTMLns = "http://www.w3.org/1999/xhtml"
DataViewDoc = "http://www.w3.org/2003/g/data-view"
# Namespace helper for terms of the data-view vocabulary
# (e.g. DataView.profileTransformation).
DataView = RDF.NS("%s#" % DataViewDoc)

# Media type prefix that makes grddl() parse a fetched document as RDF/XML.
RDF_mediaType = "application/rdf+xml"

def grddl(addr, inf=None, already=None):
    """Glean formal meaning of doc at addr and return as an RDF Graph (Model)

    addr    -- address of the source document; also passed to the
               transformations as the "xmlfile" parameter and used as
               the base URI when parsing their RDF/XML output.
    inf     -- optionally names a file that has the contents of the doc
               at addr.  When omitted, the document is fetched into a
               temporary file that is removed again before returning.
    already -- list of addresses that getTransforms() must not follow.
               Defaults to a fresh [DataViewDoc, HTMLns] list per call;
               getTransforms() appends to the list it is given.

    Exceptions from localCopy(), doXSLT() (IOError when xsltproc fails)
    and the RDF parser propagate to the caller.
    """
    # A list literal in the signature is created once and shared by every
    # call; since getTransforms() appends to it, profiles seen during one
    # call would be skipped in later, unrelated calls.
    if already is None:
        already = [DataViewDoc, HTMLns]

    progress("grddl(", (addr, inf, already), ")")

    if inf is None:
        (tmp, meta) = localCopy(addr,
                                (RDF_mediaType, "text/html", "text/xml", "*"))
        try:
            # A response without a Content-Type header is handled as
            # "not RDF" rather than failing on the header lookup.
            ctype = meta.get("content-type") or ""
            if ctype.startswith(RDF_mediaType):
                return parseRDF(tmp, addr)
            return grddl(addr, tmp, already)
        finally:
            os.remove(tmp)

    data = RDF.Model()
    rdfp = RDF.Parser(name="rdfxml")

    # Scratch file that receives the output of each transformation in turn.
    (rdffd, rdffn) = tempfile.mkstemp(".rdf", "grddl-statements")
    os.close(rdffd)

    try:
        for xform in getTransforms(inf, addr, already):
            doXSLT(xform, inf, rdffn, {"xmlfile": addr})
            # Merge this transformation's statements into the result.
            rdfp.parse_into_model(data, "file:" + rdffn, RDF.Uri(addr))
            progress("data size:", data.size())
        return data
    finally:
        os.remove(rdffn)


def parseRDF(inf, addr):
    """Parse the RDF/XML file named inf into a new RDF Model and return it.

    inf  -- name of a local file; read through a "file:" address.
    addr -- address used as the base URI while parsing.
    """
    model = RDF.Model()
    parser = RDF.Parser(name="rdfxml")

    source = "file:" + inf
    base = RDF.Uri(addr)
    parser.parse_into_model(model, source, base)
    return model


def localCopy(addr, acceptTypes):
    """Fetch addr into a temporary file and return (filename, headers).

    addr        -- address to retrieve.
    acceptTypes -- sequence of media type strings; joined with commas to
                   form the Accept request header.

    Returns the name of the temporary file holding the response body and
    the response's info() object.  Removing the file is up to the caller.

    Exceptions from urllib2.urlopen() and from reading or writing
    propagate; the temporary file is removed again if the copy fails.
    """
    req = urllib2.Request(url=addr)
    req.add_header("Accept", ",".join(acceptTypes))
    f = urllib2.urlopen(req)
    try:
        # Grab the headers before the response is closed.
        meta = f.info()
        (tfd, tnam) = tempfile.mkstemp(".txt", "grddl-xforms")
        try:
            # Binary mode: store the document bytes exactly as received.
            tf = os.fdopen(tfd, "wb")
            try:
                tf.write(f.read())
            finally:
                tf.close()
        except:
            # Do not leave a partial copy behind; the error still propagates.
            os.remove(tnam)
            raise
    finally:
        f.close()
    return tnam, meta


def getTransforms(inf, base, already):
    """Return the list of transformation addresses that apply to inf.

    inf     -- name of the local file holding the source document.
    base    -- address of the document; passed to the GET_TRANSFORMS
               stylesheet as its "xmlfile" parameter.
    already -- list of addresses that are not to be followed.  Each
               address whose data was gleaned successfully is appended,
               so the caller's list is modified in place.

    The stylesheet output is read as lines of "<key> <address>".  Keys
    'P' and 'R' mark addresses to be gleaned recursively for
    profileTransformation / namespaceTransformation statements
    (presumably profile and root-namespace links -- confirm against
    getTransforms.xsl); any other key marks a transformation directly.

    IOError from doXSLT() on the source document propagates; an IOError
    while gleaning one of the linked documents just skips that document.
    """
    ret = []
    profiles = []

    (outfd, outfname) = tempfile.mkstemp(".txt", "grddl-xforms")
    os.close(outfd)
    try:
        doXSLT(GET_TRANSFORMS, inf, outfname,
               {"xmlfile": base})
        outfp = open(outfname)
        try:
            for l in outfp.readlines():
                line = l.rstrip("\r\n")
                progress("getTransforms line:", line)
                # Blank lines carry no key/address pair; unpacking them
                # would fail with ValueError.
                if not line.strip():
                    continue
                k, u = line.split(None, 1)
                if k in ('P', 'R'):
                    profiles.append(u)
                else:
                    ret.append(u)
        finally:
            # Close before removing the file (and do not leak the handle).
            outfp.close()
    finally:
        os.remove(outfname)

    # @@add profileTransformation to spec, namespace doc
    lookups = (("PROFILE XFORM: ", DataView.profileTransformation),
               ("NAMESPACE XFORM: ", DataView.namespaceTransformation))
    for profile in profiles:
        if profile in already:
            continue
        progress("PROFILE: ", profile)

        try:
            # The extended list keeps the recursive call from coming
            # back to this same address.
            profileData = grddl(profile, None, already + [profile])
        except IOError:  # @@ also catch parsing errors
            continue

        for label, predicate in lookups:
            q = RDF.Statement(subject=RDF.Uri(profile),
                              predicate=predicate,
                              object=None)
            for s in profileData.find_statements(q):
                progress(label, s.object.uri)
                ret.append(str(s.object.uri))
        already.append(profile)
    return ret


def doXSLT(xform, inf, outf, params = {}):
    """Apply the stylesheet xform to the file inf, writing to the file outf.

    xform  -- stylesheet argument handed to xsltproc.
    inf    -- input document argument handed to xsltproc.
    outf   -- output file name (xsltproc's -o option).
    params -- mapping of names to values, each passed as a
              --stringparam option.

    Raises IOError (from spawn) when xsltproc exits with non-zero status.
    """
    command = ["xsltproc", "--novalid", "-o", outf]
    for name, value in params.items():
        command += ["--stringparam", name, value]
    command += [xform, inf]

    spawn(XSLTPROC, command)


def spawn(path, args):
    """Run the program at path with argument list args and wait for it.

    The arguments are handed to os.spawnv() as a list, so no shell
    re-parses them.  Raises IOError(status, path, args) when the status
    returned by os.spawnv() is not 0.
    """
    progress("SPAWN: ", " ".join(args))
    status = os.spawnv(os.P_WAIT, path, args)
    if status != 0:
        raise IOError(status, path, args)


# Non-zero turns on the diagnostic output written by progress().
DEBUG = 0


def progress(*args):
    """Write str() of each argument, then a newline, to stderr.

    Nothing is written unless the module-level DEBUG flag is true.
    """
    if DEBUG:
        emit = sys.stderr.write
        for item in args:
            emit(str(item))
        emit("\n")

# All Things Pythonic
# Python main() functions
# by Guido van Rossum
# May 15, 2003
# http://www.artima.com/weblogs/viewpost.jsp?thread=4829


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    argv -- full argument vector including the program name; defaults
            to sys.argv.

    Options:
      -h, --help         print usage and return 0
      -o, --output FILE  file the gleaned RDF is serialized to (required)
      -c, --copy FILE    local copy of the input document
      -d, --debug        turn on progress messages on stderr

    Exactly one positional argument, the address of the input document,
    is required.  Returns 0 on success and 2 on a usage error.
    """
    global DEBUG

    if argv is None:
        argv = sys.argv

    try:
        opts, args = getopt.getopt(argv[1:], "ho:c:d",
                                   ["help", "output=", "copy=", "debug"])
    except getopt.GetoptError:
        # Tell the user which option was rejected before showing usage.
        # sys.exc_info() is used because it reads the same on every
        # Python version this script may run under.
        sys.stderr.write("glean.py: %s\n" % (sys.exc_info()[1],))
        usage()
        return 2

    output = None
    inf = None
    for o, a in opts:
        if o in ("-d", "--debug"):
            DEBUG = 1
        elif o in ("-h", "--help"):
            usage()
            return 0
        elif o in ("-o", "--output"):
            output = a
        elif o in ("-c", "--copy"):
            inf = a

    if not (output and len(args) == 1):
        usage()
        return 2

    data = grddl(args[0], inf)
    progress("GRDDL size:", data.size())
    RDF.Serializer().serialize_model_to_file(output, data)

    return 0

def usage():
    """Print the module docstring, then the version string, to stdout."""
    print(__doc__)
    print(__version__)
    
# When run as a script (not imported), call main() and use its return
# value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())


# $Log: glean.py,v $
# Revision 1.1  2004/12/13 19:01:57  ben
# added test dir
#
# Revision 1.10  2004/06/04 20:53:40  connolly
# recover from 404s when following namespace and profile links
#
# Revision 1.9  2004/06/04 20:16:32  connolly
# - general XML support
# - peeking-into-namespace docs
#
# Revision 1.8  2004/06/04 18:39:53  connolly
# - fix recursion loop
# - getopt, usage()
#
# Revision 1.7  2004/05/28 19:35:21  connolly
# fixed usage doc
#
# Revision 1.6  2004/05/28 19:16:25  connolly
# point to debian package for python redland
#
# Revision 1.5  2004/05/28 19:07:23  connolly
# - peeking-into-profile-docs works!
#   - grddl is called recursively, so there's a new arg,
#     already, to prevent loops
# - input file is now optional; command-line args reordered
#   - likewise grddl() args changed
#
# Revision 1.4  2004/05/28 18:01:55  connolly
# in preparation for peeking-into-profile docs,
# use redland RDF API to merge transform
# outputs, rather than reroot.xsl hack
#
# Revision 1.3  2004/05/28 16:22:44  connolly
# - use mkstemp() to get rid of hard-coded temp filenames
# - code cleanup:
#   - full module docstring
#   - moved main closer to article citation
#
