1+ import os
2+ import subprocess
3+ import sys
4+ import re
5+ import pandas as pd
6+ from datetime import datetime
7+ import logging
8+
# Setting up logger
#
# Every run writes a timestamped log file in the current directory; console
# output is added only when --verbose / -v is passed on the command line.

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s:%(name)s:%(lineno)d:%(levelname)s:%(message)s")

# The timestamp in the name keeps successive runs from sharing a log file.
_log_timestamp = datetime.now().strftime("%d_%m_%Y__%H_%M_%S")
file_handler = logging.FileHandler(f"parse_payslip_info_log_{_log_timestamp}.log")
file_handler.setFormatter(formatter)

stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setFormatter(formatter)

logger.addHandler(file_handler)

# sys.argv[0] is the script itself, so only the remaining items are flags.
if "--verbose" in sys.argv[1:] or "-v" in sys.argv[1:]:
    logger.addHandler(stdout_handler)
26+
27+
28+
29+
def convert_pdf_to_text():
    r'''
    convert_pdf_to_text()

    Converts the payslip pdf files found in the current directory and its
    sub-directories to text files by running bin64\pdftotext.exe

    Output directory: .\converted_pdfs

    Side effects: writes allpdf.txt (full paths of the pdf files found) and
    alltexts.txt (names of the text files in converted_pdfs) to the current
    directory. Both lists are produced with the Windows "dir" command.

    Pdf files whose name is not recognised by get_fname_without_ext() are
    skipped with a warning.

    Returns the number of pdf files that pdftotext failed to convert.
    Exits with status 1 when the output directory cannot be created,
    allpdf.txt cannot be read or pdftotext cannot be started.

    '''
    output_dir = r'.\converted_pdfs'
    converter = r'bin64\pdftotext.exe'

    # directory to hold converted text files
    logger.info("Creating converted_pdfs directory")
    try:
        # exist_ok allows a re-run without deleting the directory first.
        # NOTE: text files left over from an earlier run stay in the
        # directory and will be listed in alltexts.txt again.
        os.makedirs("converted_pdfs", exist_ok=True)
    except OSError:
        logger.error("Failed to create converted_pdfs directory")
        sys.exit(1)

    # list of pdf files in dir/sub-dir, saved to a text file
    logger.info("Gathering list of full path to pdf files in the current directory/sub-directory")
    os.system("dir /s /b *.pdf > allpdf.txt")

    # put the newline separated file paths in a list, ignoring blank lines
    logger.info("Saving pdf file names to a list")
    try:
        with open('allpdf.txt', 'r') as fh:
            list_fnames = [line.strip() for line in fh.read().splitlines() if line.strip()]
    except FileNotFoundError:
        logger.error("Unable to open file: allpdf.txt")
        sys.exit(1)

    err_count = 0

    # converting files one by one
    logger.info("Generating text files from pdf")
    for fname in list_fnames:
        base_name = get_fname_without_ext(fname)
        if not base_name:
            # Without a name every such file would be written to the same
            # ".txt" target, so it is left out instead.
            logger.warning("Skipping pdf that is not named like a payslip: %s", fname)
            continue

        target_text_path = os.path.join(output_dir, base_name + ".txt")
        try:
            ret = subprocess.run([converter, fname, target_text_path], capture_output=True)
        except OSError:
            # The converter itself could not be started, so no file can be converted.
            logger.error("Unable to run %s", converter)
            sys.exit(1)

        if ret.returncode != 0:
            logger.error(
                "Error converting: %s (%s)",
                target_text_path,
                ret.stderr.decode(errors="replace").strip(),
            )
            err_count += 1

    # saving list of converted text files
    logger.info("Gathering list of text file names")
    os.system(r"dir converted_pdfs\*.txt /b > alltexts.txt")

    return err_count
78+
def get_fname_without_ext(fname):
    r'''
    get_fname_without_ext(fname)

    Returns the payslip filename (without the .pdf extension) from a filepath

    Only the last path component is looked at, and it must contain
    "Payslip_" and end in .pdf (extension in any letter case). The returned
    name starts at "Payslip_". Any other path returns an empty string.

    Ex: Return 'Payslip_Jan_2020' from d:\pdffiles\Payslip_Jan_2020.pdf
        Return '' from d:\pdffiles\ebook.pdf

    '''
    # [^\\/]+ keeps the match inside one path component, so a directory that
    # is itself called Payslip_... cannot leak into the result. The dot is
    # escaped and the pattern anchored so only a real .pdf suffix qualifies.
    match = re.search(r'(Payslip_[^\\/]+)(?i:\.pdf)$', fname)
    if match:
        return match.group(1)
    return ""
95+
def get_list_of_converted_files():
    r'''
    get_list_of_converted_files()

    Returns list of paths of the converted text files

    The file names are read from alltexts.txt (one name per line) in the
    current directory and each one is prefixed with .\converted_pdfs.
    Blank lines are ignored.

    Exits with status 1 if alltexts.txt does not exist.

    '''
    logger.info("Reading alltexts.txt, appending converted_pdfs directory name")
    try:
        with open("alltexts.txt", 'r') as fh:
            text_fnames = [line.strip() for line in fh.read().splitlines()]
    except FileNotFoundError:
        logger.error("alltexts.txt not found")
        sys.exit(1)

    # An empty name would turn into the directory path itself, so skip it.
    return [os.path.join(r".\converted_pdfs", fname) for fname in text_fnames if fname]
119+
def format_number_str(s):
    '''
    format_number_str(s)

    Converts number string to float

    Thousands separators (commas) and spaces are removed before the
    conversion. An empty or whitespace-only string gives 0.0, so the
    return value is always a float.

    Ex: 2,345.00 to 2345.00

    Raises ValueError if the remaining text is not a number.

    '''
    cleaned = s.replace(",", "").replace(" ", "").strip()
    if not cleaned:
        return 0.0
    return float(cleaned)
133+
def month_no_to_name(mnum):
    '''
    month_no_to_name(mnum)

    Returns 3 letter month name from month number

    Ex: 1 -> Jan, 2 -> Feb, 12 -> Dec

    Raises IndexError if mnum is outside 1..12.

    '''
    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    # Without this check 0 and negative numbers would index from the end of
    # the sequence and silently give a wrong month (0 -> Dec).
    if not 1 <= mnum <= len(months):
        raise IndexError(f"month number out of range (1-12): {mnum}")
    return months[mnum - 1]
145+
146+
class Payslip:
    '''
    Payslip class

    Contains methods to extract payslip details from a converted text file

    read_text(fname) loads the file and fills the attributes below; each
    get_* method searches the loaded text for one labelled value and
    returns it as a string, or "" when the label is not found.

    '''

    def __init__(self):
        self.raw_payslip_text = ""
        self._reset()

    def _reset(self):
        # Clears the raw text and every extracted field.
        self.pay_period = ""
        self.pay_date = ""
        self.epf_no = ""
        self.uan_no = ""
        self.basic_salary = ""
        self.gross_salary = ""
        self.net_salary = ""
        self.gross_salary_ytd = ""
        self.pf_amount = ""
        self.pf_ytd = ""
        self.income_tax = ""
        self.income_tax_ytd = ""
        self.raw_payslip_text = ""

    def _search(self, pattern):
        # Returns group 1 of the first match of pattern in the raw text, or "".
        match = re.search(pattern, self.raw_payslip_text)
        return match.group(1) if match else ""

    def read_text(self, fname):
        '''
        Loads the text file fname and extracts all payslip fields from it

        A missing file is logged and leaves every field empty.
        '''
        # Start from a clean state so a failed read on a re-used instance
        # cannot keep the values of the previously read payslip.
        self._reset()
        try:
            with open(fname, 'r') as fh:
                self.raw_payslip_text = fh.read()
        except FileNotFoundError:
            logger.error("File not found: %s", fname)
            return

        self.pay_period = self.get_pay_period()
        self.pay_date = self.get_pay_date()
        self.epf_no = self.get_epf_number()
        self.uan_no = self.get_uan_number()
        self.basic_salary = self.get_basic_salary()
        self.gross_salary = self.get_gross_sal()
        self.net_salary = self.get_net_sal()
        self.gross_salary_ytd = self.get_gross_sal_ytd()
        self.pf_amount = self.get_pf()
        self.pf_ytd = self.get_pf_ytd()
        self.income_tax = self.get_income_tax()
        self.income_tax_ytd = self.get_income_tax_ytd()

    def get_pay_period(self):
        # Two digit/dot groups separated by whitespace and/or dashes after "Pay Period :"
        return self._search(r'Pay\sPeriod\s:\s?([\d.]+[\s\-]+[\d.]+)')

    def get_pay_date(self):
        # Value follows "Pay Date", a blank line and a colon
        return self._search(r'Pay\sDate\n\n:\s?([\d+.]+)')

    def get_epf_number(self):
        # Word characters and slashes after "Emp PF Number:"
        return self._search(r'Emp\sPF\sNumber:\s?([\w\/]+)')

    def get_uan_number(self):
        # Digits after "UAN", one or more newlines and a colon
        return self._search(r'UAN[\n]+:\s?(\d+)')

    def get_basic_salary(self):
        # Amount on the line(s) after "Basic Salary"
        return self._search(r'Basic\sSalary\n+([\d,.]+)')

    def get_gross_sal(self):
        # Amount on the line(s) after "Total Gross"
        return self._search(r'Total\sGross\n+([\d,.]+)')

    def get_net_sal(self):
        # Amount on the line(s) after "NET PAY"
        return self._search(r'NET\sPAY\n+([\d,.]+)')

    def get_gross_sal_ytd(self):
        # Amount on the line(s) after "YTD GROSS"
        return self._search(r'YTD\sGROSS\n+([\d,.]+)')

    def get_pf(self):
        # Amount on the line(s) after "Provident Fund"
        return self._search(r'Provident\sFund\n+([\d.,]+)')

    def get_pf_ytd(self):
        # Amount on the line(s) after "YTD Employee PF"
        return self._search(r'YTD\sEmployee\sPF\n+([\d.,]+)')

    def get_income_tax(self):
        # Amount on the line(s) after "Income Tax"
        return self._search(r'Income\sTax\n+([\d,.]+)')

    def get_income_tax_ytd(self):
        # Amount on the line(s) after "YTD TAX"
        return self._search(r'YTD\sTAX\n+([\d,.]+)')
273+
274+
275+
# Collected values, one list per output column. The key order is the column
# order of the exported sheet, and every key is also the name of the Payslip
# attribute that feeds that column.
payslip_details = {
    "pay_period": [],
    "pay_date": [],
    "basic_salary": [],
    "gross_salary": [],
    "net_salary": [],
    "gross_salary_ytd": [],
    "pf_amount": [],
    "pf_ytd": [],
    "income_tax": [],
    "income_tax_ytd": [],
    "epf_no": [],
    "uan_no": [],
}

# Columns holding amounts that are converted from text to float
numeric_columns = [
    "basic_salary",
    "net_salary",
    "gross_salary",
    "gross_salary_ytd",
    "pf_amount",
    "pf_ytd",
    "income_tax",
    "income_tax_ytd",
]


def _split_pay_date(pay_date):
    # Returns (year, 3 letter month name) taken from a day.month.year pay
    # date. Parts that cannot be determined come back as "" so that one
    # payslip with a missing or odd date does not abort the whole export.
    match = re.search(r'\d+\D(\d+)\D(\d{4})', pay_date)
    if not match:
        return "", ""
    month_no = int(match.group(1))
    if not 1 <= month_no <= 12:
        return match.group(2), ""
    return match.group(2), month_no_to_name(month_no)


logger.info("Process started")
logger.info("Converting pdf to text")
conversion_errors = convert_pdf_to_text()
if conversion_errors:
    logger.warning("%d pdf file(s) could not be converted", conversion_errors)

list_txt_fnames = get_list_of_converted_files()

pay = Payslip()

logger.info("Saving payslip details from each text file to dictionary")
# Saving payslip details from each text file to dictionary
for fname in list_txt_fnames:
    # A fresh object per file: values of the previous payslip must never
    # show up in the row of a file that could not be read.
    pay = Payslip()
    pay.read_text(fname)
    for field, values in payslip_details.items():
        values.append(getattr(pay, field))

logger.info("Creating dataframe from dictionary")
# creating dataframe from dictionary
pay_df = pd.DataFrame.from_dict(payslip_details)

logger.info("Formatting columns containing numeric data")
# Formatting columns containing numeric data
# converting object to float
for column in numeric_columns:
    pay_df[column] = pay_df[column].apply(format_number_str)

logger.info("Creating Series to hold year and month")
# series to hold month and year
years = pay_df['pay_date'].apply(lambda x: _split_pay_date(x)[0])
months = pay_df['pay_date'].apply(lambda x: _split_pay_date(x)[1])

logger.info("Appending year and month column to the start")
# appending year and month to the start
pay_df.insert(0, 'year', years)
pay_df.insert(1, 'months', months)

logger.info("Exporting to Excel")
# exporting to Excel
pay_df.to_excel("payslips.xlsx", index=False)

if conversion_errors:
    logger.warning("Process completed with %d conversion error(s)", conversion_errors)
else:
    logger.info("Process completed successfully")
0 commit comments