From 9506a106f464503d2e14d24616f283db6cfe58cf Mon Sep 17 00:00:00 2001 From: Guillaume Raffy Date: Fri, 27 Jan 2023 14:31:11 +0100 Subject: [PATCH] =?UTF-8?q?extraction=20des=20achats=20informatiques=20?= =?UTF-8?q?=C3=A0=20partir=20de=20commandes-2019-cnrs-t002.tsv=20:=20c'est?= =?UTF-8?q?=20beaucoup=20plus=20complet?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 5 ++++ src/main.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a6204a1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +tmp +*.xls +*.xlsx +.~lock* +*.docx diff --git a/src/main.py b/src/main.py index aaaf357..9a764bc 100644 --- a/src/main.py +++ b/src/main.py @@ -4,7 +4,8 @@ import re import pandas -def cnrsformat1_to_sheet(in_tsv_file_path: Path, out_tsv_file_path: Path): +# converts a cnrs geslab type t001 report to a single table +def geslabt001_to_sheet(in_tsv_file_path: Path, out_tsv_file_path: Path): with open(in_tsv_file_path) as inf, open(out_tsv_file_path, 'wt') as outf: table_header_has_been_written = False for line in inf.readlines(): @@ -25,14 +26,36 @@ def cnrsformat1_to_sheet(in_tsv_file_path: Path, out_tsv_file_path: Path): outf.write(line) -def main(): - cnrsformat1_file_path = Path('./from-cloud.ipr/2019/commandes-2019-cnrs-t001.tsv') +# converts a cnrs geslab type t002 report to a single table +def geslabt002_to_sheet(in_tsv_file_path: Path, out_tsv_file_path: Path): + with open(in_tsv_file_path) as inf, open(out_tsv_file_path, 'wt') as outf: + table_header_has_been_written = False + for line in inf.readlines(): + # Entité dépensière : AESJULLIEN AES RENNES METROPOLE MC JULLIEN Crédits reçus : 40,000.00 + # Disponible : 24,743.14 + # + # + # N° commande Souche Libellé commande Date commande Raison sociale fournisseur Montant consommé sur exercice antérieur 
Montant consommé sur l'exercice Montant réservé Montant facturé Code origine Nature dépense Statut Cde groupée + is_table_header = re.match(r'^N° com. GESLAB', line) is not None + # for some strange reason, the column 'N° com. GESLAB''s contents are alternatively something like '1952-12-17 12:00:00 AM' and something like '19,855.00' + if is_table_header and not table_header_has_been_written: + outf.write('# %s' % line) + table_header_has_been_written = True + if re.match(r'^[0-9,.]+\t', line): + outf.write(line) + elif re.match(r'^[0-9][0-9][0-9][0-9]-[0-9]+-[0-9]+ [0-9][0-9]:[0-9][0-9]:[0-9][0-9] [AP]M\t', line): + outf.write(line) + else: + print('ignoring line : %s' % line) + + +def geslabt001_to_itorders(geslabt001_file_path: Path, itorders_file_path: Path): sheet_file_path = Path('./tmp/commandes-2019-cnrs.tsv') - cnrsformat1_to_sheet(cnrsformat1_file_path, sheet_file_path) + geslabt001_to_sheet(geslabt001_file_path, sheet_file_path) df = pandas.read_csv(sheet_file_path, sep='\t') - # delete the colums for which the labve is of the form 'Unnamed: '. They come from the csv export of libre office + # delete the columns for which the label is of the form 'Unnamed: '. They come from the csv export of libre office unnamed_columns = [column_label for column_label in df.keys() if re.match(r'^Unnamed', column_label) is not None] print(unnamed_columns) df = df.drop(columns=unnamed_columns) @@ -48,6 +71,44 @@ def main(): print(it_df) print(it_df[['Montant facturé', 'Raison sociale fournisseur', 'Libellé commande']]) + it_df.to_csv(itorders_file_path, sep='\t') + + +def geslabt002_to_itorders(geslabt001_file_path: Path, itorders_file_path: Path): + sheet_file_path = Path('./tmp/commandes-2019-cnrs.tsv') + geslabt002_to_sheet(geslabt001_file_path, sheet_file_path) + + df = pandas.read_csv(sheet_file_path, sep='\t') + + # delete the columns for which the label is of the form 'Unnamed: '. 
They come from the csv export of libre office + unnamed_columns = [column_label for column_label in df.keys() if re.match(r'^Unnamed', column_label) is not None] + print(unnamed_columns) + df = df.drop(columns=unnamed_columns) + + print(df.columns) + print(df.keys()) + print(df) + PETIT_MATERIEL_INFORMATIQUE = '1100' + EQUIPEMENT_INFORMATIQUE = '2100' + INFORMATIQUE_ACHAT = 'D3--' + it_df = df[(df['Matière'] == PETIT_MATERIEL_INFORMATIQUE) | (df['Matière'] == EQUIPEMENT_INFORMATIQUE) | (df['Matière'] == INFORMATIQUE_ACHAT)] + print(it_df) + + # to remove clutter, drop the columns that we don't need + print(it_df.keys()) + it_df = it_df.drop(columns=['# N° com. GESLAB']) # this column seems to contain anything but ordering number + it_df = it_df.drop(columns=['N° ligne']) # I don't know the meaning of this column + it_df = it_df.drop(columns=['Code origine']) # I don't know the meaning of this column + it_df = it_df.drop(columns=['Elément analytique']) # I don't know the meaning of this column + it_df = it_df.drop(columns=['S']) # I don't know the meaning of this column + + print(it_df[['Facturé ligne', 'Raison sociale fournisseur', 'Libellé ligne']]) + it_df.to_csv(itorders_file_path, sep='\t') + + +def main(): + geslabt001_to_itorders(Path('./achats-ipr/2019/commandes-2019-cnrs-t001.tsv'), Path('./tmp/commandes-it-2019-cnrs-001.tsv')) + geslabt002_to_itorders(Path('./achats-ipr/2019/commandes-2019-cnrs-t002.tsv'), Path('./tmp/commandes-it-2019-cnrs-002.tsv')) main()