From 9506a106f464503d2e14d24616f283db6cfe58cf Mon Sep 17 00:00:00 2001
From: Guillaume Raffy <guillaume.raffy@univ-rennes1.fr>
Date: Fri, 27 Jan 2023 14:31:11 +0100
Subject: [PATCH] =?UTF-8?q?extraction=20des=20achats=20informatiques=20?=
 =?UTF-8?q?=C3=A0=20partir=20de=20commandes-2019-cnrs-t002.tsv=20:=20c'est?=
 =?UTF-8?q?=20beaucoup=20plus=20complet?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore  |  5 ++++
 src/main.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 5 deletions(-)
 create mode 100644 .gitignore
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a6204a1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+tmp
+*.xls
+*.xlsx
+.~lock*
+*.docx
diff --git a/src/main.py b/src/main.py
index aaaf357..9a764bc 100644
--- a/src/main.py
+++ b/src/main.py
@@ -4,7 +4,8 @@ import re
 import pandas
 
 
-def cnrsformat1_to_sheet(in_tsv_file_path: Path, out_tsv_file_path: Path):
+# converts a cnrs geslab type t001 report to a single table
+def geslabt001_to_sheet(in_tsv_file_path: Path, out_tsv_file_path: Path):
     with open(in_tsv_file_path) as inf, open(out_tsv_file_path, 'wt') as outf:
         table_header_has_been_written = False
         for line in inf.readlines():
@@ -25,14 +26,36 @@ def cnrsformat1_to_sheet(in_tsv_file_path: Path, out_tsv_file_path: Path):
                 outf.write(line)
 
 
-def main():
-    cnrsformat1_file_path = Path('./from-cloud.ipr/2019/commandes-2019-cnrs-t001.tsv')
+# converts a cnrs geslab type t002 report to a single table
+def geslabt002_to_sheet(in_tsv_file_path: Path, out_tsv_file_path: Path):
+    with open(in_tsv_file_path) as inf, open(out_tsv_file_path, 'wt') as outf:
+        table_header_has_been_written = False
+        for line in inf.readlines():
+            # Entité dépensière : 				AESJULLIEN		AES RENNES METROPOLE MC JULLIEN									Crédits reçus : 								40,000.00	
+            # 															Disponible : 								24,743.14	
+            # 																								
+            # 																								
+            # N° commande	Souche		Libellé commande				Date commande	Raison sociale fournisseur	Montant consommé sur exercice antérieur	Montant consommé sur l'exercice			Montant réservé					Montant facturé		Code origine	Nature dépense	Statut		Cde groupée            
+            is_table_header = re.match(r'^N° com. GESLAB', line) is not None
+            # for some strange reason, the contents of the 'N° com. GESLAB' column alternate between something like '1952-12-17 12:00:00 AM' and something like '19,855.00'
+            if is_table_header and not table_header_has_been_written:
+                outf.write('# %s' % line)
+                table_header_has_been_written = True
+            if re.match(r'^[0-9,.]+\t', line):
+                outf.write(line)
+            elif re.match(r'^[0-9][0-9][0-9][0-9]-[0-9]+-[0-9]+ [0-9][0-9]:[0-9][0-9]:[0-9][0-9] [AP]M\t', line):
+                outf.write(line)
+            else:
+                print('ignoring line : %s' % line)
+
+
+def geslabt001_to_itorders(geslabt001_file_path: Path, itorders_file_path: Path):
     sheet_file_path = Path('./tmp/commandes-2019-cnrs.tsv')
-    cnrsformat1_to_sheet(cnrsformat1_file_path, sheet_file_path)
+    geslabt001_to_sheet(geslabt001_file_path, sheet_file_path)
 
     df = pandas.read_csv(sheet_file_path, sep='\t')
 
-    # delete the colums for which the labve is of the form 'Unnamed: <n>'. They come from the csv export of libre office
+    # delete the columns for which the label is of the form 'Unnamed: <n>'. They come from the csv export of LibreOffice
     unnamed_columns = [column_label for column_label in df.keys() if re.match(r'^Unnamed', column_label) is not None]
     print(unnamed_columns)
     df = df.drop(columns=unnamed_columns)
@@ -48,6 +71,44 @@ def main():
     print(it_df)
 
     print(it_df[['Montant facturé', 'Raison sociale fournisseur', 'Libellé commande']])
+    it_df.to_csv(itorders_file_path, sep='\t')
+
+
+def geslabt002_to_itorders(geslabt001_file_path: Path, itorders_file_path: Path):
+    sheet_file_path = Path('./tmp/commandes-2019-cnrs.tsv')
+    geslabt002_to_sheet(geslabt001_file_path, sheet_file_path)
+
+    df = pandas.read_csv(sheet_file_path, sep='\t')
+
+    # delete the columns for which the label is of the form 'Unnamed: <n>'. They come from the csv export of LibreOffice
+    unnamed_columns = [column_label for column_label in df.keys() if re.match(r'^Unnamed', column_label) is not None]
+    print(unnamed_columns)
+    df = df.drop(columns=unnamed_columns)
+
+    print(df.columns)
+    print(df.keys())
+    print(df)
+    PETIT_MATERIEL_INFORMATIQUE = '1100'
+    EQUIPEMENT_INFORMATIQUE = '2100'
+    INFORMATIQUE_ACHAT = 'D3--'
+    it_df = df[(df['Matière'] == PETIT_MATERIEL_INFORMATIQUE) | (df['Matière'] == EQUIPEMENT_INFORMATIQUE) | (df['Matière'] == INFORMATIQUE_ACHAT)]
+    print(it_df)
+
+    # to remove clutter, drop the columns that we don't need
+    print(it_df.keys())
+    it_df = it_df.drop(columns=['# N° com. GESLAB'])  # this column seems to contain anything but ordering number
+    it_df = it_df.drop(columns=['N° ligne'])  # I don't know the meaning of this column
+    it_df = it_df.drop(columns=['Code origine'])  # I don't know the meaning of this column
+    it_df = it_df.drop(columns=['Elément analytique'])  # I don't know the meaning of this column
+    it_df = it_df.drop(columns=['S'])  # I don't know the meaning of this column
+
+    print(it_df[['Facturé ligne', 'Raison sociale fournisseur', 'Libellé ligne']])
+    it_df.to_csv(itorders_file_path, sep='\t')
+
+
+def main():
+    geslabt001_to_itorders(Path('./achats-ipr/2019/commandes-2019-cnrs-t001.tsv'), Path('./tmp/commandes-it-2019-cnrs-001.tsv'))
+    geslabt002_to_itorders(Path('./achats-ipr/2019/commandes-2019-cnrs-t002.tsv'), Path('./tmp/commandes-it-2019-cnrs-002.tsv'))
 
 
 main()