Skip to content

Commit 05a5721

Browse files
Upload parse_payslips.py
0 parents  commit 05a5721

File tree

1 file changed

+346
-0
lines changed

1 file changed

+346
-0
lines changed

parse_payslips.py

Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
import os
2+
import subprocess
3+
import sys
4+
import re
5+
import pandas as pd
6+
from datetime import datetime
7+
import logging
8+
9+
# Logging configuration.
# Every run writes DEBUG-and-above records to its own log file; the same
# records are mirrored on stdout only when --verbose / -v is passed.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s:%(name)s:%(lineno)d:%(levelname)s:%(message)s")

# The log file name carries the start time (day_month_year__hour_minute_second)
# so successive runs do not write to the same file.
file_handler = logging.FileHandler(
    f'parse_payslip_info_log_{datetime.now().strftime("%d_%m_%Y__%H_%M_%S")}.log'
)
stdout_handler = logging.StreamHandler(sys.stdout)

file_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)

# The file handler is always attached.
logger.addHandler(file_handler)

# The console handler is attached only on request.
if len(sys.argv) > 1 and ("--verbose" in sys.argv or "-v" in sys.argv):
    logger.addHandler(stdout_handler)
26+
27+
28+
29+
30+
def convert_pdf_to_text():
    r'''
    convert_pdf_to_text()

    Converts the payslip pdf files found in the current directory and its
    sub-directories to text files, using bin64\pdftotext.exe.

    Output directory: .\converted_pdfs

    Side effects:
        - creates the directory converted_pdfs
        - writes allpdf.txt  (full path of every pdf file found)
        - writes alltexts.txt (names of the text files in converted_pdfs)

    The file lists are produced with the Windows shell command "dir".

    Pdf files whose name is not recognised by get_fname_without_ext() are
    skipped: they have no usable target name and would otherwise all be
    written to the same ".txt" file.

    Returns the number of pdf files that pdftotext failed to convert.

    Exits the process with status 1 when the output directory cannot be
    created (this includes the case where it already exists, so text files
    left over from an earlier run are never mixed into the result) or when
    allpdf.txt cannot be opened.
    '''
    # directory to hold converted text files
    logger.info("Creating converted_pdfs directory")
    try:
        os.mkdir("converted_pdfs")
    except OSError as err:
        logger.error(f"Failed to create converted_pdfs directory: {err}")
        sys.exit(1)

    # list of pdf files in dir/sub-dir, saved to a text file
    logger.info("Gathering list of full path to pdf files in the current directory/sub-directory")
    os.system("dir /s /b *.pdf > allpdf.txt")

    # put the newline separated file paths in a list
    logger.info("Saving pdf file names to a list")
    try:
        with open('allpdf.txt', 'r') as fh:
            list_fnames = fh.read().split('\n')
    except FileNotFoundError:
        logger.error("Unable to open file: allpdf.txt")
        sys.exit(1)

    err_count = 0

    # converting files one by one
    logger.info("Generating text files from pdf")
    for fname in list_fnames:
        if not fname:
            # blank entry, e.g. the one after the final newline
            continue

        base_name = get_fname_without_ext(fname)
        if not base_name:
            logger.warning(f"Skipping pdf that is not a payslip: {fname}")
            continue

        target_text_path = os.path.join(r'.\converted_pdfs', f"{base_name}.txt")
        ret = subprocess.run([r'bin64\pdftotext.exe', fname, target_text_path], capture_output=True)
        if ret.returncode != 0:
            logger.error(f"Error converting: {fname} -> {target_text_path}")
            err_count += 1

    # saving list of converted text files
    logger.info("Gathering list of text file names")
    os.system(r"dir converted_pdfs\*.txt /b > alltexts.txt")

    return err_count
78+
79+
def get_fname_without_ext(fname):
    r'''
    get_fname_without_ext(fname)

    Returns the payslip file name (without the .pdf extension) from a filepath.

    Only the last path component is considered and it must contain
    "Payslip_" and end in ".pdf" (extension compared case-insensitively).
    Any other path returns an empty string.

    Ex: Returns 'Payslip_Jan_2020' from d:\pdffiles\Payslip_Jan_2020.pdf
        Returns '' from d:\pdffiles\ebook.pdf
    '''
    # [^\\/]+ keeps the match inside the last path component, so a directory
    # called "Payslip_..." can never leak into the returned name.
    # \. and $ make sure ".pdf" really is the extension.
    match = re.search(r'(Payslip_[^\\/]+)(?i:\.pdf)$', fname)
    return match.group(1) if match else ""
95+
96+
def get_list_of_converted_files():
    r'''
    get_list_of_converted_files()

    Returns the list of paths (.\converted_pdfs\<name>) of the converted
    text files named in alltexts.txt, one name per line.

    Blank lines (such as the one after the final newline) are ignored,
    because joining an empty name would yield the directory itself
    instead of a file.

    Exits the process with status 1 when alltexts.txt does not exist.
    '''
    logger.info("Reading alltexts.txt, appending converted_pdfs directory name")
    try:
        with open("alltexts.txt", 'r') as fh:
            text_fnames = fh.read().split('\n')
    except FileNotFoundError:
        logger.error("alltexts.txt not found")
        sys.exit(1)

    return [
        os.path.join(r".\converted_pdfs", name)
        for name in text_fnames
        if name.strip()
    ]
119+
120+
def format_number_str(s):
    '''
    format_number_str(s)

    Converts number string to float

    Commas (thousands separators) and spaces are removed before the
    conversion. A string with nothing left after that (empty, or only
    spaces/commas) is returned as 0.0, so the result is always a float.

    Ex: 2,345.00 to 2345.00

    Raises ValueError when the remaining text is not a valid number.
    '''
    cleaned = s.replace(",", "").replace(" ", "")
    if not cleaned:
        return 0.0
    return float(cleaned)
133+
134+
def month_no_to_name(mnum):
    '''
    month_no_to_name(mnum)

    Returns 3 letter month name from month number

    Ex: 1 -> Jan, 2 -> Feb, 12 -> Dec

    Raises IndexError when mnum is not in the range 1-12.
    '''
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # Explicit range check: without it 0 and negative numbers would wrap
    # around through negative list indexing (0 -> 'Dec') instead of failing
    # the way 13 does.
    if not 1 <= mnum <= 12:
        raise IndexError(f"month number out of range (expected 1-12): {mnum}")
    return months[mnum - 1]
145+
146+
147+
class Payslip:
    '''
    Payslip class

    Holds the details of one payslip together with the methods that extract
    them, by regular expression, from the text of a converted payslip.

    Every detail is stored as the string found in the text; a detail that
    is not found is stored as an empty string.
    '''

    def __init__(self):
        # All details start out empty until read_text() fills them.
        self.pay_period = self.pay_date = ""
        self.epf_no = self.uan_no = ""
        self.basic_salary = self.gross_salary = self.net_salary = ""
        self.gross_salary_ytd = ""
        self.pf_amount = self.pf_ytd = ""
        self.income_tax = self.income_tax_ytd = ""
        # Full text of the payslip file that was read last.
        self.raw_payslip_text = ""

    def read_text(self, fname):
        '''
        Reads the text file fname and fills every payslip detail from it.

        When the file does not exist the error is logged and all attributes
        keep the values they had before the call.
        '''
        try:
            with open(fname, 'r') as fh:
                self.raw_payslip_text = fh.read()
        except FileNotFoundError:
            logger.error(f"File not found: {fname}")
            return

        self.pay_period = self.get_pay_period()
        self.pay_date = self.get_pay_date()
        self.epf_no = self.get_epf_number()
        self.uan_no = self.get_uan_number()
        self.basic_salary = self.get_basic_salary()
        self.gross_salary = self.get_gross_sal()
        self.net_salary = self.get_net_sal()
        self.gross_salary_ytd = self.get_gross_sal_ytd()
        self.pf_amount = self.get_pf()
        self.pf_ytd = self.get_pf_ytd()
        self.income_tax = self.get_income_tax()
        self.income_tax_ytd = self.get_income_tax_ytd()

    def _first_group(self, pattern):
        '''
        Returns the first capture group of the first match of pattern in
        the payslip text, or an empty string when there is no match.
        '''
        found = re.search(pattern, self.raw_payslip_text)
        return found.group(1) if found else ""

    def get_pay_period(self):
        '''Returns the two dates following "Pay Period :", or "".'''
        return self._first_group(r'Pay\sPeriod\s:\s?([\d.]+[\s\-]+[\d.]+)')

    def get_pay_date(self):
        '''Returns the value following "Pay Date", a blank line and ":", or "".'''
        return self._first_group(r'Pay\sDate\n\n:\s?([\d+.]+)')

    def get_epf_number(self):
        '''Returns the value following "Emp PF Number:", or "".'''
        return self._first_group(r'Emp\sPF\sNumber:\s?([\w\/]+)')

    def get_uan_number(self):
        '''Returns the digits following "UAN", newline(s) and ":", or "".'''
        return self._first_group(r'UAN[\n]+:\s?(\d+)')

    def get_basic_salary(self):
        '''Returns the amount on the line(s) after "Basic Salary", or "".'''
        return self._first_group(r'Basic\sSalary\n+([\d,.]+)')

    def get_gross_sal(self):
        '''Returns the amount on the line(s) after "Total Gross", or "".'''
        return self._first_group(r'Total\sGross\n+([\d,.]+)')

    def get_net_sal(self):
        '''Returns the amount on the line(s) after "NET PAY", or "".'''
        return self._first_group(r'NET\sPAY\n+([\d,.]+)')

    def get_gross_sal_ytd(self):
        '''Returns the amount on the line(s) after "YTD GROSS", or "".'''
        return self._first_group(r'YTD\sGROSS\n+([\d,.]+)')

    def get_pf(self):
        '''Returns the amount on the line(s) after "Provident Fund", or "".'''
        return self._first_group(r'Provident\sFund\n+([\d.,]+)')

    def get_pf_ytd(self):
        '''Returns the amount on the line(s) after "YTD Employee PF", or "".'''
        return self._first_group(r'YTD\sEmployee\sPF\n+([\d.,]+)')

    def get_income_tax(self):
        '''Returns the amount on the line(s) after "Income Tax", or "".'''
        return self._first_group(r'Income\sTax\n+([\d,.]+)')

    def get_income_tax_ytd(self):
        '''Returns the amount on the line(s) after "YTD TAX", or "".'''
        return self._first_group(r'YTD\sTAX\n+([\d,.]+)')
273+
274+
275+
276+
# Column name -> list of values, one entry per payslip.
# The keys are also the names of the Payslip attributes the values are
# read from, and their order is the column order of the exported sheet.
payslip_details = {
    "pay_period": [],
    "pay_date": [],
    "basic_salary": [],
    "gross_salary": [],
    "net_salary": [],
    "gross_salary_ytd": [],
    "pf_amount": [],
    "pf_ytd": [],
    "income_tax": [],
    "income_tax_ytd": [],
    "epf_no": [],
    "uan_no": [],
}

logger.info("Process started")
logger.info("Converting pdf to text")
# number of pdf files that could not be converted; reported at the end
conversion_errors = convert_pdf_to_text()

list_txt_fnames = get_list_of_converted_files()

logger.info("Saving payslip details from each text file to dictionary")
# Saving payslip details from each text file to dictionary
for fname in list_txt_fnames:
    if not os.path.isfile(fname):
        # e.g. a blank entry that was turned into the directory path
        logger.warning(f"Skipping entry that is not a file: {fname}")
        continue
    # A fresh Payslip per file: read_text() leaves the attributes untouched
    # when it cannot open the file, so a reused object would repeat the
    # previous payslip's values.
    pay = Payslip()
    pay.read_text(fname)
    for column in payslip_details:
        payslip_details[column].append(getattr(pay, column))

logger.info("Creating dataframe from dictionary")
# creating dataframe from dictionary
pay_df = pd.DataFrame.from_dict(payslip_details)

logger.info("Formatting columns containing numeric data")
# Formatting columns containing numeric data
# converting object to float
numeric_columns = [
    'basic_salary',
    'net_salary',
    'gross_salary',
    'gross_salary_ytd',
    'pf_amount',
    'pf_ytd',
    'income_tax',
    'income_tax_ytd',
]
for column in numeric_columns:
    pay_df[column] = pay_df[column].apply(format_number_str)


def _split_pay_date(pay_date):
    '''
    Returns (year, 3 letter month name) taken from a day.month.year pay date.

    Either part is an empty string when it cannot be determined, so a
    payslip without a readable pay date does not abort the export.
    '''
    match = re.search(r'\d+\.(\d+)\.(\d{4})', pay_date)
    if not match:
        return "", ""
    month_no = int(match.group(1))
    month_name = month_no_to_name(month_no) if 1 <= month_no <= 12 else ""
    return match.group(2), month_name


logger.info("Creating Series to hold year and month")
# series to hold month and year
date_parts = [_split_pay_date(pay_date) for pay_date in pay_df['pay_date']]
years = pd.Series([year for year, _ in date_parts], index=pay_df.index, dtype=object)
months = pd.Series([month for _, month in date_parts], index=pay_df.index, dtype=object)

logger.info("Appending year and month column to the start")
# appending year and month to the start
pay_df.insert(0, 'year', years)
pay_df.insert(1, 'months', months)

logger.info("Exporting to Excel")
# exporting to Excel
pay_df.to_excel("payslips.xlsx", index=False)

if conversion_errors:
    logger.warning(f"Process completed with {conversion_errors} pdf conversion error(s)")
else:
    logger.info("Process completed successfully")

0 commit comments

Comments
 (0)