mlhim-specs-dev team mailing list archive

Thread
Date

[Branch ~cdd-dev/cdd/trunk] Rev 237: Added tool to create CCDs from NCI CDE data.

To: MLHIM Specifications Developers <mlhim-specs-dev@xxxxxxxxxxxxxxxxxxx>
From: noreply@xxxxxxxxxxxxx
Date: Thu, 30 Aug 2012 16:42:11 -0000
Reply-to: noreply@xxxxxxxxxxxxx
Sender: bounces@xxxxxxxxxxxxx

------------------------------------------------------------
revno: 237
committer: Timothy W. Cook <timothywayne.cook@xxxxxxxxx>
branch nick: cdd
timestamp: Thu 2012-08-30 13:40:52 -0300
message:
  Added tool to create CCDs from NCI CDE data.
added:
  src/xls2ccd/
  src/xls2ccd/README.txt
  src/xls2ccd/readxls.py
  src/xls2ccd/xls2ccd.py


--
lp:cdd
https://code.launchpad.net/~cdd-dev/cdd/trunk

Your team MLHIM Specifications Developers is subscribed to branch lp:cdd.
To unsubscribe from this branch go to https://code.launchpad.net/~cdd-dev/cdd/trunk/+edit-subscription

=== added directory 'src/xls2ccd'
=== added file 'src/xls2ccd/README.txt'
--- src/xls2ccd/README.txt	1970-01-01 00:00:00 +0000
+++ src/xls2ccd/README.txt	2012-08-30 16:40:52 +0000
@@ -0,0 +1,30 @@
+xls2ccd.py
+REQUIRES: Python 2.6/2.7 and xlrd  
+
+This utility is used to create MLHIM CCDs from standard-template XLS downloads from the NCI CDE.
+https://cdebrowser.nci.nih.gov/CDEBrowser/  
+
+Some pre-processing is required. 
+
+Download a set of CDEs in Excel format using the "Available Downloads" link near the top-right of the page. 
+Select the set you want to convert from the links available on the "caDSR CDE Downloads" page.
+
+Be sure that the filename contains "NCI_Standard_Template (MM)_(DD)_2012".  Other formats have not been tested.  Please let us know which dates of the Standard Template work for you.
+
+Open the spreadsheet.  
+Delete rows 1 - 10.
+Delete columns A & B.
+
+Save (in .xls format) the modified spreadsheet into the same directory as this utility. 
+
+Execute the tool with this command line:
+python xls2ccd.py <filename> 
+
+All CCDs created that are not of the CHARACTER or ALPHANUMERIC datatype will be flagged with an "R" as the first character of their filename.  These must be reviewed manually before use.  When all corrections are made, remove the "R" from the CCD element name and then save the file without the "R".
+
+readxls.py is a development tool to explore the files for improvements. 
+
+
+
+
+

=== added file 'src/xls2ccd/readxls.py'
--- src/xls2ccd/readxls.py	1970-01-01 00:00:00 +0000
+++ src/xls2ccd/readxls.py	2012-08-30 16:40:52 +0000
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import sys
+
+from xlrd import open_workbook
+
+xlsfile = sys.argv[1]
+
+wb = open_workbook(xlsfile)
+ccd_dict = {}
+currkey = ''
+
+for s in wb.sheets():
+    print 'Sheet:',s.name
+    print 'Cols.: ',s.ncols
+    print 'Rows: ', s.nrows
+    for row in range(s.nrows):
+        values = []
+        for col in range(s.ncols):
+            values.append(s.cell(row,col).value)
+        if values[0]:
+            currkey = values[0]
+            ccd_dict[currkey] = [values[0],values[1],values[2],values[3],values[4],values[5],values[6],values[7]]
+        elif currkey:
+            if values[8]:
+                ccd_dict[currkey].append((values[8],values[9],values[10]))
+
+for k in ccd_dict.keys():
+    size = len(ccd_dict[k])
+    print "Title: "+ccd_dict[k][0]
+    #print "Description: "+ccd_dict[k][1]
+    #print "Public ID: "+ccd_dict[k][2]
+    #print "CDE Version: "+ccd_dict[k][3]
+    #print ": "+ccd_dict[k][4]
+    print "Datatype: "+ccd_dict[k][5]
+    print "UOM: "+ccd_dict[k][6]
+    print "Format: "+ccd_dict[k][7]
+    #if size > 8:
+        #for n in range(8,size):
+            #print "Enumeration: ", ccd_dict[k][n]
+    print '\n===========================================\n'
\ No newline at end of file

=== added file 'src/xls2ccd/xls2ccd.py'
--- src/xls2ccd/xls2ccd.py	1970-01-01 00:00:00 +0000
+++ src/xls2ccd/xls2ccd.py	2012-08-30 16:40:52 +0000
@@ -0,0 +1,227 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import sys
+import uuid
+import time
+from xlrd import open_workbook
+
+xlsfile = sys.argv[1]
+
+wb = open_workbook(xlsfile)
+ccd_dict = {}
+currkey = ''
+
+for s in wb.sheets():
+    print 'Sheet:',s.name
+    print 'Cols.: ',s.ncols
+    print 'Rows: ', s.nrows
+    for row in range(s.nrows):
+        values = []
+        for col in range(s.ncols):
+            values.append(s.cell(row,col).value)
+        if values[0]:
+            currkey = values[0]
+            ccd_dict[currkey] = [values[0],values[1],values[2],values[3],values[4],values[5],values[6],values[7]]
+        elif currkey:
+            if values[8]:
+                ccd_dict[currkey].append((values[8],values[9],values[10]))
+
+for k in ccd_dict.keys():
+    size = len(ccd_dict[k])
+    print "Title: "+ccd_dict[k][0]
+    print "Description: "+ccd_dict[k][1]
+    print "Public ID: "+ccd_dict[k][2]
+    print "CDE Version: "+ccd_dict[k][3]
+    print ": "+ccd_dict[k][4]
+    print "Datatype: "+ccd_dict[k][5]
+    print "UOM: "+ccd_dict[k][6]
+    print "Format: "+ccd_dict[k][7]
+    if size > 8:
+        for n in range(8,size):
+            print "Enumeration: ", ccd_dict[k][n]
+    print '\n===========================================\n'
+
+    #ccd setup
+    ccd_id = "ccd_"+str(uuid.uuid4()).replace('-','_')
+    if ccd_dict[k][5] not in ("CHARACTER","ALPHANUMERIC"):
+        ccd_id = "R"+ccd_id
+    ccdct = "ct_"+str(uuid.uuid4()).replace('-','_')
+    ect = "ct_"+str(uuid.uuid4()).replace('-','_')
+    dct = "ct_"+str(uuid.uuid4()).replace('-','_')
+    schema = ccd_id + ".xsd"
+    n = time.localtime()
+    now = "%d-%d-%d" % (n.tm_year, n.tm_mon, n.tm_mday)
+
+    #/ccd setup
+    print ccd_id
+    title = ccd_dict[k][0]
+    description = ccd_dict[k][1] + "\n" +"          CDE Version: "+ccd_dict[k][3] + "\n" + "           Datatype: "+ccd_dict[k][5] + "\n"+ "           UOM: "+ccd_dict[k][6] + "\n" + "           Format: "+ccd_dict[k][7]
+    date = str(now)
+    creator = "Generated by MLHIM xls2ccd.py"
+    contributor = ""
+    language = ("en-US")
+    publisher = ("INCT-MACC MLHIM Lab")
+    subject = ccd_dict[k][0]
+    source = ("NCI CDE Public ID: "+ccd_dict[k][2])
+    rights = ("CC-BY")
+    relation = ("None")
+    coverage = ("Global")
+    resource_type = "MLHIM Concept Constraint Definition (CCD)"
+    resource_format = "text/xml"
+    identifier = ccd_id
+
+    data_name = title.replace("'",'')
+    dt = ccd_dict[k][5]
+
+    f = open(schema,'w')
+
+    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+
+    f.write("""
+
+    <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema#"
+          xmlns:mlhim2="http://www.mlhim.org/xmls/mlhim2/2_3_1"
+          elementFormDefault="qualified"
+          targetNamespace="http://www.mlhim.org/xmls/mlhim2/2_3_1"
+          xmlns:data-view="http://www.w3.org/2003/g/data-view#"
+          data-view:transformation="http://www.mlhim.org/ccd/ccd_md_view.xsl">
+
+    <xs:annotation>
+     <xs:appinfo>
+     """)
+
+    f.write("""
+       This is a """+title+""" Concept Constraint Definition schema file (CCD).
+    <rdf:RDF
+         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns:dc= "http://purl.org/dc/elements/1.1/">
+
+    <rdf:Description rdf:about="http://www.mlhim.org/ccd/""" + ccd_id + """">
+      <dc:title>"""+title+"""</dc:title>
+      <dc:creator>"""+creator+"""</dc:creator>
+      <dc:contributor>"""+contributor+"""</dc:contributor>
+      <dc:subject>"""+subject+"""</dc:subject>
+      <dc:source>"""+source+"""</dc:source>
+      <dc:rights>"""+rights+"""</dc:rights>
+      <dc:relation>"""+relation+"""</dc:relation>
+      <dc:coverage>"""+coverage+"""</dc:coverage>
+      <dc:type>"""+resource_type+"""</dc:type>
+      <dc:identifier>"""+identifier+"""</dc:identifier>
+      <dc:description>"""+description+"""</dc:description>
+      <dc:publisher>"""+publisher+"""</dc:publisher>
+      <dc:date>"""+date+"""</dc:date>
+      <dc:format>"""+resource_format+"""</dc:format>
+      <dc:language>"""+language+"""</dc:language>
+    </rdf:Description>
+
+    </rdf:RDF>
+     </xs:appinfo>
+    </xs:annotation>
+
+
+
+    <xs:include schemaLocation="http://www.mlhim.org/xmls/mlhim2/2_3_1/mlhim2.xsd"/>
+
+    <xs:element name='"""+ccd_id+"""' type="mlhim2:"""+ccdct+""""/>
+
+    <xs:complexType name='"""+ccdct+"""'>
+      <xs:complexContent>
+      <xs:restriction base="mlhim2:CCDType">
+      <xs:sequence>
+        <xs:element name="definition"  minOccurs="1"  maxOccurs="1" type="mlhim2:"""+ect+""""/>
+      </xs:sequence>
+      </xs:restriction>
+      </xs:complexContent>
+    </xs:complexType>
+
+    <xs:complexType name='"""+ect+"""'>
+      <xs:complexContent>
+      <xs:restriction base="mlhim2:ElementType">
+      <xs:sequence>
+        <xs:element name="Element_dv"  minOccurs="1"  maxOccurs="1" type="mlhim2:"""+dct+""""/>
+      </xs:sequence>
+      </xs:restriction>
+      </xs:complexContent>
+    </xs:complexType>
+    """)
+
+    if dt == "DATE":
+        f.write("""
+    <xs:complexType name='"""+dct+"""'>
+      <xs:complexContent>
+      <xs:restriction base="mlhim2:DvDateType">
+      <xs:sequence>
+        <xs:element name="data_name"  minOccurs="1"  maxOccurs="1" type="xs:string" fixed='"""+data_name+"""'/>
+        <xs:element name="DvDate_dv"  minOccurs="1"  maxOccurs="1" type="xs:date"/>
+      </xs:sequence>
+      </xs:restriction>
+      </xs:complexContent>
+    </xs:complexType>
+        """)
+
+    elif dt == "TIME":
+        f.write("""
+    <xs:complexType name='"""+dct+"""'>
+      <xs:complexContent>
+      <xs:restriction base="mlhim2:DvTimeType">
+      <xs:sequence>
+        <xs:element name="data_name"  minOccurs="1"  maxOccurs="1" type="xs:string" fixed='"""+data_name+"""'/>
+        <xs:element name="DvTime_dv"  minOccurs="1"  maxOccurs="1" type="xs:time"/>
+      </xs:sequence>
+      </xs:restriction>
+      </xs:complexContent>
+    </xs:complexType>
+        """)
+
+    elif len(ccd_dict[k]) > 8:
+        f.write("""
+    <xs:complexType name='"""+dct+"""'>
+      <xs:complexContent>
+      <xs:restriction base="mlhim2:DvStringType">
+      <xs:sequence>
+        <xs:element name="data_name"  minOccurs="1"  maxOccurs="1" type="xs:string" fixed='"""+data_name+"""'/>
+        <xs:element name="DvString_dv"  minOccurs="1"  maxOccurs="1">
+            <xs:simpleType>
+              <xs:restriction base="xs:string">
+        """)
+        for n in range(8, len(ccd_dict[k])):
+            enum = (ccd_dict[k][n][0]).replace("'",'')
+            f.write("""       <xs:enumeration value='"""+enum+"""'/>
+            """)
+        f.write("""
+              </xs:restriction>
+            </xs:simpleType>
+        </xs:element>
+      </xs:sequence>
+      </xs:restriction>
+      </xs:complexContent>
+    </xs:complexType>
+        """)
+
+
+    else:
+
+        f.write("""
+    <xs:complexType name='"""+dct+"""'>
+      <xs:complexContent>
+      <xs:restriction base="mlhim2:DvStringType">
+      <xs:sequence>
+        <xs:element name="data_name"  minOccurs="1"  maxOccurs="1" type="xs:string" fixed='"""+data_name+"""'/>
+        <xs:element name="DvString_dv"  minOccurs="1"  maxOccurs="1" type="xs:string"/>
+      </xs:sequence>
+      </xs:restriction>
+      </xs:complexContent>
+    </xs:complexType>
+        """)
+
+    f.write("""
+    </xs:schema>""")
+
+    f.close()
+
+
+
+
+
+