We are in the process of ingesting a backlog of 400+ digitised legacy publications (dating as far back as the 1970s) into SALDRU's production DSpace instance at work [1]. The digitised PDF documents were shared via Dropbox, with the corresponding RDF files (containing the metadata records) [2] sent separately. I resorted to using DSpace's Simple Archive Format [3], with a Python script that uses the metadata elements in the RDF files to dynamically create the archive directories for each resource; create the corresponding dublin_core.xml and contents files; and burrow into my Dropbox folder to find the PDF files and copy them into the related archive directories.
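For context, each RDF file follows the RePEc ReDIF paper template [2], with one record per publication, and each record maps to one Simple Archive Format item directory. A minimal sketch of a record and the item it becomes, using made-up values (note that in this backlog the File-URL field evidently carries a bare file name, which is what the script relies on):

Template-Type: ReDIF-Paper 1.0
Author-Name: Jane Doe
Title: A Hypothetical Working Paper
Creation-Date: 1984
File-URL: wp01.pdf
Handle: RePEc:ldr:wpaper:01

archive/
  rdffilename/              derived from the RDF file name
    1980-1989/              sub-collection derived from Creation-Date
      wp01/                 directory named after the bitstream
        dublin_core.xml     the Dublin Core metadata record
        contents            lists the bitstream file name
        wp01.pdf            the digitised PDF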
Pseudocode
The logic used to implement the script is very rudimentary and is shown below in its basic form:
For each RDF file
    For each record
        create archive directory
        build & write dublin_core.xml file
        burrow into Dropbox folder & check for corresponding bitstream (PDF file)
        copy bitstream
        create contents file
Code
Function to scrape metadata elements from an RDF file and subsequently create archives for individual resources.
import os
import shutil

from lxml import etree


def repecrdfextracter(rdffilepath, safarchive, searchlocation):
    """A function to parse & convert a RePEc RDF file into a DSpace
    Simple Archive Format ingestion archive.

    keyword arguments:
    rdffilepath -- RDF source file
    safarchive -- base location where the archive will be created
    searchlocation -- base location where bitstreams are located
    """
    with open(rdffilepath) as rdffile:
        rdfdata = rdffile.read().replace('\r', '').replace('\n', ' ')
    # mark the record header so the flat string can be split into records
    rdfdata = rdfdata.replace('Template-Type', '#####Template-Type')
    # mark each metadata field so records can be split into fields
    for fieldlabel in ('Abstract:', 'Author-Name:', 'Author-Workplace-Name:',
                       'Creation-Date:', 'File-Format:', 'File-Function:',
                       'File-URL:', 'Handle:', 'Keywords:', 'Number:',
                       'Price:', 'Title:'):
        rdfdata = rdfdata.replace(fieldlabel, '$$$' + fieldlabel)
    print "RDF LENGTH: ", len(rdfdata)
    # split into records
    records = rdfdata.split("#####")
    print "RECORDS LENGTH: ", len(records)
    print "RECORDS TYPE: ", type(records)
    # for each record, split the fields in the file
    for record in records:
        if len(record) == 0:
            continue
        # render record
        safroot = etree.Element('dublin_core')
        fields = record.split('$$$')
        saffile = ""
        filestatus = ""
        safsubcollection = ""
        for field in fields:
            # split the field into its key and value
            key = field.split(':')[0].strip()
            value = field[len(field.split(':')[0]) + 1:].strip()
            # derive sub-collection name using the Creation-Date RDF field
            if key == 'Creation-Date':
                if value == "":
                    safsubcollection = "1990-1999"
                elif 1970 <= int(value) <= 1979:
                    safsubcollection = "1970-1979"
                elif 1980 <= int(value) <= 1989:
                    safsubcollection = "1980-1989"
                elif 1990 <= int(value) <= 1999:
                    safsubcollection = "1990-1999"
                elif 2000 <= int(value) <= 2009:
                    safsubcollection = "2000-2009"
                elif 2010 <= int(value) <= 2019:
                    safsubcollection = "2010-2019"
                else:
                    safsubcollection = "1990-1999"
                print "Creation-Date:", value, "Subcollection:", safsubcollection
            # derive bitstream name using the File-URL RDF field
            if key == 'File-URL':
                if value == '':
                    saffile = 'UNKNOWN'
                else:
                    saffile = value
                print "THE FILE-URL: ", value
            safinput = safchildnode(key, value)
            # append field as an XML root element child node, but only if it
            # maps to an element we want and the value is not empty
            if safinput[1] != '' and safinput[2] != '' and safinput[3] != '':
                dcvalue = etree.Element('dcvalue')
                dcvalue.set('element', safinput[2])
                dcvalue.set('qualifier', safinput[3])
                dcvalue.text = safinput[1]
                safroot.append(dcvalue)
        # create default publisher child node
        dcvalue = etree.Element('dcvalue')
        dcvalue.set('element', 'publisher')
        dcvalue.set('qualifier', 'none')
        dcvalue.text = "Southern African Labour and Development Research Unit"
        safroot.append(dcvalue)
        # make item directory
        safitemdirectory = os.path.abspath(os.path.join(
            safarchive, os.path.basename(rdffilepath).split('.')[0],
            safsubcollection, saffile.split('.')[0]))
        if not os.path.exists(safitemdirectory):
            os.makedirs(safitemdirectory)
        print 'FOZA CREATED: ', os.path.exists(safitemdirectory)
        # copy bitstream from source location to item directory
        safitembitstreamfile = saffindbitstream(searchlocation, saffile)
        try:
            shutil.copy(safitembitstreamfile, safitemdirectory)
            print "Copying: ", safitembitstreamfile, " to: ", safitemdirectory
            filestatus = "FILEFOUND"
            print "File:", saffile, ",STATUS:", filestatus, ",RDFfile:", rdffilepath
        except Exception as details:
            print "Exception FOUND when copying", details
            # if the PDF file is missing, delete the item directory
            # and continue with the next record
            filestatus = "FILENOTFOUND"
            shutil.rmtree(safitemdirectory)
            print "File:", saffile, ",STATUS:", filestatus, ",RDFfile:", rdffilepath
            continue
        # create contents file with the bitstream name in it
        safcontentsfile = os.path.abspath(os.path.join(safitemdirectory, 'contents'))
        with open(safcontentsfile, 'w') as safcontents:
            safcontents.write(saffile)
        # write dublin_core.xml file with the appropriate details
        safmetadatafile = os.path.abspath(os.path.join(safitemdirectory, 'dublin_core.xml'))
        with open(safmetadatafile, 'w') as safmetadata:
            safmetadata.write(etree.tostring(safroot, encoding='utf-8',
                                             xml_declaration=True,
                                             pretty_print=True))
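Once the archives have been generated, each batch can be handed to DSpace's command-line batch importer, as described in the Simple Archive Format documentation [3]. A hedged sketch of the invocation; the eperson address, collection handle and paths below are placeholders:

# placeholders throughout; see [3] for the full set of options
[dspace]/bin/dspace import --add --eperson=admin@example.org \
    --collection=123456789/42 \
    --source=/home/scap/Sandbox/saldru/wpbacklog/rdffilename/1980-1989 \
    --mapfile=wpbacklog-1980-1989.map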
Function to create Simple Archive Format XML child nodes
def safchildnode(key, value):
    """A function that creates SAF child nodes

    keyword arguments:
    key -- key entry
    value -- value entry e.g. Author-Name: Lighton Phiri
    """
    element = ""
    qualifier = ""
    if key == "Abstract":
        element = "description"
        qualifier = "abstract"
    elif key == "Author-Name":
        element = "contributor"
        qualifier = "author"
    elif key == "Creation-Date":
        element = "date"
        qualifier = "issued"
    elif key == "Title":
        element = "title"
        qualifier = "none"
    elif key == "Keywords":
        element = "description"
        qualifier = "none"
    return (key, value, element, qualifier)
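To illustrate, a mapped key yields a four-tuple of key, value, Dublin Core element and qualifier, while an unmapped key (Price, say; the sample values are made up) comes back with empty element and qualifier strings, which is how the calling function filters it out:

# a mapped key produces a usable element/qualifier pair
print safchildnode('Author-Name', 'Lighton Phiri')
# -> ('Author-Name', 'Lighton Phiri', 'contributor', 'author')

# an unmapped key yields empty strings, so the caller skips it
print safchildnode('Price', 'ZAR15.00')
# -> ('Price', 'ZAR15.00', '', '')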
Function to walk a given directory tree and search for bitstreams
def saffindbitstream(searchlocation, bitstream):
    """A function to search for a specified bitstream"""
    absolutefilepath = ""
    # walk the entire search location, recording the path of any
    # file whose name matches the requested bitstream
    for root, dirs, files in os.walk(searchlocation):
        for filename in files:
            if os.path.basename(filename) == bitstream:
                absolutefilepath = os.path.abspath(os.path.join(root, filename))
    return absolutefilepath
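Worth noting: the walk does not stop at the first hit, so if duplicate file names exist in the Dropbox folder the last match encountered wins, and a missing file returns an empty string, which makes the subsequent shutil.copy raise and trip the FILENOTFOUND branch above. A quick check, with a hypothetical bitstream name:

# 'wp01.pdf' is a hypothetical file name used for illustration
pdfpath = saffindbitstream('/home/scap/Dropbox', 'wp01.pdf')
if pdfpath == "":
    print "Bitstream not found in Dropbox folder"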
In addition, I have a basic shell script that calls the Python script, feeding it the RDF file to process, the filesystem location where the archives will be created, and the location of the Dropbox folder.
#!/bin/bash -l
# usage: ./rdfimport safdestinationdir bitstreamsourcedir
#
# run script in directory where RDF files are located
# ./rdfimport /home/scap/Sandbox/saldru/wpbacklog /home/scap/Dropbox
#
for rdffile in *rdf
do
    python -c "import saldrudspaceingest; saldrudspaceingest.repecrdfextracter('$rdffile', '$1', '$2')";
done
Bibliography
[1] http://www.scaprogramme.org.za/participating-institutions/university-of-cape-town
[2] http://ideas.repec.org/t/papertemplate.html
[3] https://wiki.duraspace.org/display/DSDOC3x/Importing+and+Exporting+Items+via+Simple+Archive+Format