I receive an XML file from an application and want to extract some information from it and write that information to an Excel file. The XML file contains data about vacations. I need the data from the previous month with an approval status of 1, 5 or 6.
The following function returns the month and year:
def set_month(now=None):
    '''
    Determine the calendar month that precedes a reference point in time.
    param 1: optional time.struct_time (or any sequence that starts with
             year and month); defaults to the current local time
    returns: tuple of integers (year, month)
    '''
    # Read the clock only once: two separate time.localtime() calls could
    # straddle a month or year change and produce a mixed-up result.
    if now is None:
        now = time.localtime()
    year, month = now[0], now[1]
    if month == 1:
        # January wraps around to December of the previous year.
        return (year - 1, 12)
    return (year, month - 1)
I assume there is a better way to do this.
Below you'll find the complete script and an example XML file. I'd be happy about any ideas to improve the code.
#!/usr/bin/env python3
# file: urlaub.py
# desc: Reads an xml file created by another software. This file contains
# vacation requests. This program searches for all vacation requests of the
# previous month, grabs the relevant information and writes it into an excel file.
#
# possible exit status if the program does not crash:
# 0: all clear
# 1: you have a duplicate column character in your wantedInfos dictionary
# 2: failed to read the xml file or to get the tree root
# 3: failed to write the excel file
import time
import logging
from sys import exit
import datetime as dt
from openpyxl import Workbook as WB
from openpyxl.styles import Font
import xml.etree.ElementTree as ET
# --- logging --- #
def configure_logging(loglevel):
    '''
    Set up the root logger with a fixed line format and report the level
    that was chosen.
    param 1: integer (logging level, e.g. logging.DEBUG)
    '''
    logging.basicConfig(
        format="%(levelname)-8s %(message)s",
        level=loglevel,
    )
    message = "Loglevel set to {}".format(loglevel)
    logging.info(message)
# --- misc --- #
def check_doubles():
    '''
    Verifies that every column character is used only once across the
    wantedInfos and interestedAbsence dictionaries. On the first duplicate
    an error is logged and the program ends with exit status 1.
    '''
    logging.debug("Check for double cell characters")
    seen = []
    for table in (wantedInfos, interestedAbsence):
        for settings in table.values():
            column = settings["Col"]
            if column in seen:
                logging.error("Double cell character found: {}".format(column))
                exit(1)
            seen.append(column)
    logging.debug("Character list {}".format(sorted(seen)))
# --- xml processing --- #
def set_month(now=None):
    '''
    Determines the calendar month that precedes a reference point in time
    and logs which month will be searched.
    param 1: optional time.struct_time (or any sequence that starts with
             year and month); defaults to the current local time
    returns: tuple of integers (year, month)
    '''
    # Read the clock only once: two separate time.localtime() calls could
    # straddle a month or year change and produce a mixed-up result.
    if now is None:
        now = time.localtime()
    year, month = now[0], now[1]
    if month == 1:
        # January wraps around to December of the previous year.
        year, month = year - 1, 12
    else:
        month -= 1
    logging.info("Search for vacation requests in {} {}".format(month, year))
    return (year, month)
def absence_is_relevant(absence):
    '''
    Checks if the given absence is relevant. An absence is relevant when it
    begins in the wanted month (wantedYear / wantedMonth) and its approval
    status is 1, 5 or 6. Absences without a usable start date or status are
    treated as not relevant instead of crashing the run.
    param 1: xml element
    returns: boolean
    '''
    # Only these approval states are wanted; an allow-list also keeps
    # status 3 and any unknown future codes out of the result.
    relevant_states = (1, 5, 6)

    start = absence.findtext(get_path(["Beginning"]))
    if not start:
        logging.debug("Absence is not relevant")
        return False
    date = start.split("T")
    logging.debug("Test absence for {}".format(date))
    try:
        # the date part looks like YYYY-MM-DD, only year and month matter
        year, month = (int(part) for part in date[0].split("-")[:2])
    except ValueError:
        logging.warning("Skip absence with unparsable start '{}'".format(start))
        return False
    if year != wantedYear or month != wantedMonth:
        logging.debug("Absence is not relevant")
        return False

    logging.debug("Absence is relevant")
    status = absence.findtext(get_path(["ApprovalStatus"]))
    try:
        status_code = int(status)
    except (TypeError, ValueError):
        # missing or non-numeric status
        status_code = None
    if status_code not in relevant_states:
        logging.debug("Status is {} ... skip.".format(status))
        return False
    return True
def request_is_relevant(request):
    '''
    Checks if the given request is relevant. A request is relevant as soon
    as one entry of its absence list passes absence_is_relevant.
    param 1: xml tree element
    returns: boolean
    '''
    # findtext yields None for a missing GUID element, so the debug output
    # cannot raise an AttributeError on incomplete requests.
    guid_main = request.findtext(get_path(["GUID"]))
    logging.debug("Test for relevanz: {}".format(guid_main))
    absences = request.findall(get_path(["AbsenceList", "Absence"]))
    if any(absence_is_relevant(absence) for absence in absences):
        logging.debug("Request is relevant")
        return True
    logging.debug("Request is not relevant")
    return False
def get_path(part_list):
    '''
    Builds a search path from plain tag names. Every level gets the
    namespace prefix via get_full_tag, the levels are joined with "/".
    param 1: list
    returns: string
    '''
    return "/".join(get_full_tag(tag) for tag in part_list)
def get_full_tag(tag):
    '''
    Prefixes a single xml tag name with the namespace stored in ns.
    param 1: string
    returns: string
    '''
    qualified = "".join([ns, tag])
    return qualified
def crop_namespace(node):
    '''
    Returns the tag name of an xml element without its "{namespace}"
    prefix. Tags that carry no namespace are returned unchanged.
    param 1: xml element or none
    returns: string or none
    '''
    if node is None:
        return None
    # rpartition keeps the whole tag when there is no "}" in it, where
    # split("}")[1] would raise an IndexError.
    return node.tag.rpartition("}")[2].strip()
def process_request(request):
    '''
    Processes the given request. We walk two times through the wantedInfos
    dictionary. First we collect the data for all keys whose "Abs" flag is
    false directly from the request. Then we walk through the absence list:
    for every relevant absence the data for all keys flagged with true is
    stored in a subdictionary under 'AbsenceList', keyed by a counter of
    the relevant absences (starting at 1). Missing xml nodes are stored as
    None on both levels.
    param 1: xml-tree object
    returns: dictionary
    '''
    currentRequest: dict = {'AbsenceList': {}}
    # findtext yields None for missing elements, so logging cannot crash
    guid_main = request.findtext(get_path(["GUID"]))
    name = request.findtext(get_path(["RequestorFullName"]))
    logging.debug("Processing {}: {}".format(guid_main, name))

    # collect request level information
    for key, info in wantedInfos.items():
        if info['Abs']:
            continue
        path = get_path(info["Path"])
        logging.debug("Search for key '{}' '{}'".format(key, path))
        node = request.find(path)
        logging.debug("Found node '{}'".format(crop_namespace(node)))
        value = None if node is None else node.text
        currentRequest[key] = value
        logging.debug("Store: key: {}, value: '{}'".format(key, value))

    # walk through absence list and collect absence level information
    relevantCounter = 0
    absences = request.findall(get_path(["AbsenceList", "Absence"]))
    for absenceCounter, absence in enumerate(absences, start=1):
        guid_abs = absence.findtext(get_path(["VacationRequestGUID"]))
        logging.debug("{}. Abs: {}".format(absenceCounter, guid_abs))
        if not absence_is_relevant(absence):
            logging.debug("Skip absence")
            continue
        logging.debug("Processing absence")
        relevantCounter += 1
        collected = {}
        for key, info in wantedInfos.items():
            if not info['Abs']:
                continue
            node = absence.find(get_path(info['Path']))
            # same rule as on request level: a missing node becomes None
            value = None if node is None else node.text
            collected[key] = value
            logging.debug("Store: key: '{}', value: '{}'".format(key, value))
        currentRequest['AbsenceList'][relevantCounter] = collected
    return currentRequest
def xml_to_list():
    '''
    Reads the xml file named by xmlfile, walks over all vacation requests
    below the tree root and collects the data of every relevant request.
    Ends the program with exit status 2 if the file cannot be read or
    parsed.
    returns: list of dictionaries
    '''
    allVacations = []
    # read the xml tree from file and grab tree root
    try:
        tree = ET.parse(xmlfile)
        root = tree.getroot()
    except Exception as error:
        # kept broad on purpose: every failure at this point has to end
        # with the documented exit status 2
        logging.error("Failed to get xml-tree root.")
        logging.error(error)
        exit(2)
    logging.debug("Reading xml-file {} successful".format(xmlfile))

    # iterate over all vacation requests and keep the relevant ones
    requests = root.findall(get_full_tag("VacationRequests"))
    for counter, request in enumerate(requests, start=1):
        logging.debug("* {}. New request found".format(counter))
        # findtext yields None for missing elements, so logging cannot crash
        guid = request.findtext(get_path(["GUID"]))
        name = request.findtext(get_path(["RequestorFullName"]))
        logging.debug("{}: {}".format(guid, name))
        if not request_is_relevant(request):
            continue
        logging.debug("Relevant entry found: {}".format(len(allVacations) + 1))
        allVacations.append(process_request(request))
    logging.info("{} relevant requests found".format(len(allVacations)))
    return allVacations
# --- excel --- #
def get_cell_name(character, number):
    '''
    Builds a cell name such as "C7" from a column character and a row
    number. Surrounding whitespace is removed from both parts.
    param 1: string
    param 2: integer
    returns: string
    '''
    column = character.strip()
    row = str(number).strip()
    return column + row
def write_sheet_header(worksheet):
    '''
    Writes the first row of the worksheet. The cell text is the key from
    wantedInfos, written in bold. Column character and width come from the
    keys "Col" and "Width" flags. We use characters because
    worksheet.column_dimensions cannot work with integer. Finally the first
    row is pinned.
    param 1: worksheet object
    '''
    row = 1
    boldFont = Font(name="Calibri", bold=True)
    for key, info in wantedInfos.items():
        width = info["Width"]
        column = info["Col"]
        worksheet.column_dimensions[column].width = width
        cell = worksheet[get_cell_name(column, row)]
        cell.value = key
        cell.font = boldFont
        logging.debug("Column {} was set to width {}".format(key, width))
    # freeze_panes freezes all rows above AND all columns left of the given
    # cell. "A2" pins only the header row; "ZZ2" also froze every column
    # left of ZZ, which blocks horizontal scrolling.
    worksheet.freeze_panes = "A2"
    logging.debug("First row pinned")
def write_cell(worksheet, row, key, value):
    '''
    Writes "value" into a cell. The cell name is build from the wantedInfos
    "Col" flag for "key" and row. Some keys are converted into a readable
    format first: absence kind and approval status are translated through
    their dictionaries (unknown codes are written unchanged), half days are
    converted to days and date strings become date objects. A value of
    None is written as an empty cell without conversion.
    param 1: sheet object
    param 2: integer
    param 3: string
    param 4: string or none
    '''
    date_keys = ("Start", "Ende", "Eingetr.")
    # some values need to be converted into a readable format
    if value is not None:
        if key == "Art":
            value = absenceKindDict.get(value, value)
        elif key == "Tage":
            # the source counts half days; keep x.5 for odd counts instead
            # of silently dropping the half day
            half_days = int(value)
            value = half_days // 2 if half_days % 2 == 0 else half_days / 2
        elif key in ("Freigabe1", "Freigabe2", "Freigabe"):
            value = approvalStatusDict.get(value, value)
        elif key in date_keys:
            # only the date part in front of the "T" is used
            value = dt.date.fromisoformat(value.split("T")[0])
    # write
    column = wantedInfos[key]['Col']
    cell_name = get_cell_name(column, row)
    worksheet[cell_name] = value
    if key in date_keys:
        worksheet[cell_name].number_format = "dd.mm.yyyy"
    logging.debug("{}: '{}' written".format(cell_name, value))
def write_excel(vacationList):
    '''
    Receives the vacation requests list and writes its data into an excel
    file. Every request starts a new row; the request level data is written
    once, each further absence of the same request gets its own row. Ends
    the program with exit status 3 if the file cannot be saved.
    param 1: list of dictionaries
    '''
    year = str(wantedYear)
    month = str(wantedMonth).zfill(2)
    # NOTE(review): openpyxl saves workbooks in the xlsx format; the ".xls"
    # suffix is kept so the output path does not change -- confirm whether
    # it should be ".xlsx".
    fileName = "material/urlaub-{}-{}.xls".format(year, month)
    sheetName = " ".join(("urlaub", year, month))
    logging.info("Starting to write excel file {}".format(fileName))
    workbook = WB()
    workbook.iso_dates = True
    worksheet = workbook.active
    worksheet.title = sheetName
    for i in hidden_column:
        worksheet.column_dimensions[i].hidden = True
    row = 1
    # set width and write header
    write_sheet_header(worksheet)
    # process all requests handed over by the caller (not the global list)
    for request in vacationList:
        row += 1
        logging.debug("Write request into row {}: {} {}".format(
            row, request.get("GUID-R"), request.get("Name")))
        # write the main request data
        for key, value in request.items():
            if key != "AbsenceList":
                write_cell(worksheet, row, key, value)
        # then we write the absence data; the first absence shares the row
        # with the request data, every further one gets a new row
        for absence_nr, absence in request["AbsenceList"].items():
            if absence_nr > 1:
                row += 1
            logging.debug("Write absence into row {}: {} {}".format(
                row, absence_nr, absence.get("GUID-A")))
            for key, value in absence.items():
                write_cell(worksheet, row, key, value)
    logging.info("{} absence request written.".format(row - 1))
    try:
        workbook.save(fileName)
    except Exception as error:
        logging.critical("Failed to write excel file")
        # report the reason as well, it was silently dropped before
        logging.critical(error)
        exit(3)
    logging.info("Excel file successful written")
# --- start program --- #
# Path of the xml file that xml_to_list() parses.
xmlfile = "material/urlaubsantraege.xml"
# Namespace prefix that get_full_tag() puts in front of every tag name.
ns = "{urn:orkan-software.de:schema:ORKAN}"
# Describes every value that is exported. The key is the header text of the
# column. Flags per entry:
#   "Abs":   False -> value is read from the request element,
#            True  -> value is read from every relevant absence element
#   "Col":   column character in the worksheet (must be unique, see
#            check_doubles)
#   "Width": column width set by write_sheet_header
#   "Path":  list of tag names handed to get_path
# Entries that are commented out are not exported.
wantedInfos = {
# "Eingetr.": {"Abs": False, "Col": "A", "Width": 12, "Path": ["DateAdd"]},
# "Kurzname": {"Abs": False, "Col": "B", "Width": 15, "Path": ["RequestorMaMatch"]},
"Name": {"Abs": False, "Col": "C", "Width": 25, "Path": ["RequestorFullName"]},
# "Tage": {"Abs": False, "Col": "D", "Width": 8, "Path": ["RequestedHalfDays"]},
#"Typ": {"Abs": True, "Col": "E", "Width": 8, "Path": ["AbsenceType"]},
# "Freigabe1": {"Abs": False, "Col": "E", "Width": 12, "Path": ["Approval1ApprovalStatus"]},
"Freigabe1-Name": {"Abs": False, "Col": "F", "Width": 16, "Path": ["Approval1MaMatch"]},
# "Freigabe2": {"Abs": False, "Col": "G", "Width": 12, "Path": ["Approval2ApprovalStatus"]},
"Freigabe2-Name": {"Abs": False, "Col": "H", "Width": 16, "Path": ["Approval2MaMatch"]},
"Freigabe": {"Abs": True, "Col": "I", "Width": 15, "Path": ["ApprovalStatus"]},
"Art": {"Abs": True, "Col": "J", "Width": 15, "Path": ["AbsenceKindGuid"]},
"Start": {"Abs": True, "Col": "K", "Width": 12, "Path": ["Beginning"]},
"Ende": {"Abs": True, "Col": "L", "Width": 12, "Path": ["Ending"]},
"GUID-A": {"Abs": True, "Col": "M", "Width": 45, "Path": ["VacationRequestGUID"]},
"GUID-R": {"Abs": False, "Col": "N", "Width": 45, "Path": ["GUID"]},
# "Grund": {"Abs": False, "Col": "O", "Width": 40, "Path": ["RejectionReason"]},
# "Kommentar": {"Abs": False, "Col": "P", "Width": 50, "Path": ["RequestorComment"]}
}
# Currently empty; it is only read by check_doubles.
interestedAbsence: dict = {
}
# Translates the absence kind GUID (key "Art") into a readable name in
# write_cell.
absenceKindDict = {
"{953D3EA0-CB6A-477F-83F1-C1B4939BA0FD}": "Bildungsurlaub",
"{DECF84D7-ADD8-4785-AD55-662B8F70C158}": "Lehrgang HWK",
"{945EB227-B3E2-4EFF-831E-8D7D1A08281D}": "Resturlaub",
"{EFC603E1-5CAF-4153-ADF9-A035440E34CD}": "Sonderurlaub",
"{3F43AC39-6132-4397-B236-7D45F9E43019}": "Jahresurlaub"
}
# Translates the approval status code (as string) into a readable label in
# write_cell.
approvalStatusDict = {
'0': "Nicht einger.",
'1': "Gelöscht (veraltet)",
'2': "Beantragt",
'3': "In Bearbeitung",
'4': "Abgelehnt",
'5': "Gebucht",
'6': "Gelöscht"
}
# Columns that write_excel marks as hidden. "E" and "G" belong to entries
# that are commented out in wantedInfos at the moment.
hidden_column = ("E", "G", "M", "N")
# Only used by the configure_logging call below, which is commented out.
loglevel = logging.DEBUG
# Placeholders; both are overwritten with the result of set_month() below
# and then read as globals by absence_is_relevant and write_excel.
wantedMonth = None
wantedYear = None
# configure_logging(loglevel)
# The order matters: validate the column setup, fix the wanted month,
# collect the data and finally write the workbook.
check_doubles()
wantedYear, wantedMonth = set_month()
allVacations = xml_to_list()
write_excel(allVacations)
and the XML file
<VacationRequestsList xmlns="urn:orkan-software.de:schema:ORKAN">
<VacationRequests>
<DateAdd>2023-01-22</DateAdd>
<RequestorMaMatch>ALPHA</RequestorMaMatch>
<ApprovalStatus>5</ApprovalStatus>
<FilingDateDate>2023-02-22</FilingDateDate>
<RequestedHalfDays>30</RequestedHalfDays>
<Approval1RcptTyp>0</Approval1RcptTyp>
<Approval1RcptMatch>LAMBDA</Approval1RcptMatch>
<Approval1ApprovalStatus>5</Approval1ApprovalStatus>
<Approval1TimeStampPutDate>2023-02-23</Approval1TimeStampPutDate>
<Approval1MaMatch>LAMBDA</Approval1MaMatch>
<Approval2ApprovalStatus>5</Approval2ApprovalStatus>
<Approval2TimeStampPutDate>2023-02-23</Approval2TimeStampPutDate>
<Approval2MaMatch>KAPPA</Approval2MaMatch>
<RequestorFullName>Christian Alpha</RequestorFullName>
<Approval1MaFullName>Daniel Lambda</Approval1MaFullName>
<Approval2MaFullName>Karsten Kappa</Approval2MaFullName>
<AbsenceList>
<Absence>
<DateAdd>2023-01-22</DateAdd>
<MaMatch>ALPHA</MaMatch>
<AbsenceType>1</AbsenceType>
<AbsenceAccountChange>10</AbsenceAccountChange>
<Origin>0</Origin>
<Beginning>2023-07-17T00:00:00.00+02:00</Beginning>
<BeginningDate>2023-07-17</BeginningDate>
<Ending>2023-07-21T23:59:59.99+02:00</Ending>
<EndingDate>2023-07-21</EndingDate>
<ApprovalStatus>5</ApprovalStatus>
</Absence>
<Absence>
<DateAdd>2023-01-22</DateAdd>
<MaMatch>ALPHA</MaMatch>
<AbsenceType>1</AbsenceType>
<AbsenceAccountChange>10</AbsenceAccountChange>
<Origin>0</Origin>
<Beginning>2023-07-24T00:00:00.00+02:00</Beginning>
<BeginningDate>2023-07-24</BeginningDate>
<Ending>2023-07-28T23:59:59.99+02:00</Ending>
<EndingDate>2023-07-28</EndingDate>
<ApprovalStatus>5</ApprovalStatus>
</Absence>
<Absence>
<DateAdd>2023-01-22</DateAdd>
<MaMatch>ALPHA</MaMatch>
<AbsenceType>1</AbsenceType>
<AbsenceAccountChange>10</AbsenceAccountChange>
<Origin>0</Origin>
<Beginning>2023-07-31T00:00:00.00+02:00</Beginning>
<BeginningDate>2023-07-31</BeginningDate>
<Ending>2023-08-04T23:59:59.99+02:00</Ending>
<EndingDate>2023-08-04</EndingDate>
<ApprovalStatus>5</ApprovalStatus>
</Absence>
</AbsenceList>
</VacationRequests>
</VacationRequestsList>
```