Adapted concho's Dell HTML parser to the format of Dell's 2021 web pages

Note: Dell's 2020 web pages are still supported.
This commit is contained in:
Guillaume Raffy 2021-04-07 23:07:11 +02:00
parent 82359a549c
commit 048de6fed9
3 changed files with 845 additions and 71 deletions

File diff suppressed because one or more lines are too long

View File

@ -381,23 +381,22 @@ class DellConfiguratorParser():
def __init__(self):
pass
@classmethod
def _get_module(cls, root_element, section_label):
modules_element = root_element.xpath(".//div[@class='col-md-10']")[0]
def _get_module(self, root_element, section_label):
modules_element = root_element.xpath(self.get_xpath_filter('root_to_modules_element'))[0]
# print(modules_element)
for module_root in modules_element.xpath(".//div[@class='col-md-12 module']"):
for module_root in modules_element.xpath(self.get_xpath_filter('modules_element_to_modules')):
# print(module_root)
# blue modules such as "Processeurs (Passage)"
module_titles = module_root.xpath(".//div[@class='col-md-4 module-title color-017EB8']")
module_titles = module_root.xpath(self.get_xpath_filter('module_to_blue_title'))
if len(module_titles) > 0:
# print(module_title.text)
# print(len(module_title.text))
module_title = module_titles[0]
# print('module_title.text = %s ' % module_title.text)
# print(module_title.text_content())
if module_title.text == section_label:
return module_root
# grey modules such as 'Base'
module_titles = module_root.xpath(".//div[@class='col-md-4 module-title color-808080']")
module_titles = module_root.xpath(self.get_xpath_filter('module_to_grey_title'))
if len(module_titles) > 0:
# print(module_title.text)
# print(len(module_title.text))
@ -405,25 +404,33 @@ class DellConfiguratorParser():
# print(module_title.text_content())
if module_title.text == section_label:
return module_root
assert False, 'failed to find module "%s"' % section_label
@classmethod
def price_str_as_float(cls, price_as_str):
match = re.match(r'^\s*(?P<sign>[-+]?)\s*(?P<numbers>[0-9.]*)\s*€\s*$', price_as_str.replace(',',''))
assert match, 'unexpected price string (%s)' % price_as_str
# print(match['sign'], match['numbers'])
price_as_float = float("%s%s" % (match['sign'], match['numbers']))
return price_as_float
@abstractmethod
def price_str_as_float(self, price_as_str):
    """Convert a price string taken from the web page into a float.

    The textual price format depends on the page generation, so each
    concrete parser supplies its own implementation.

    :param price_as_str: the price text, including the currency symbol
    :raises NotImplementedError: always, unless overridden by a subclass
    """
    # An explicit raise (rather than ``assert False``) keeps this stub loud
    # even when python runs with -O, which strips assert statements.
    raise NotImplementedError
@abstractmethod
def get_module_label(self, module_id):
    """Return the title shown on the web page for the module ``module_id``.

    :param module_id: symbolic module identifier (eg 'cpu_change')
    :raises NotImplementedError: always, unless overridden by a subclass
    """
    # An explicit raise (rather than ``assert False``) keeps this stub loud
    # even when python runs with -O, which strips assert statements.
    raise NotImplementedError
@abstractmethod
def get_xpath_filter(self, filter_id):
    """Return the xpath expression associated with ``filter_id``.

    :param filter_id: symbolic filter identifier (eg 'module_to_options')
    :raises NotImplementedError: always, unless overridden by a subclass
    """
    # An explicit raise (rather than ``assert False``) keeps this stub loud
    # even when python runs with -O, which strips assert statements.
    raise NotImplementedError
@abstractmethod
def get_base_price(self, html_root):
    """Return the base price read from the parsed web page ``html_root``.

    :param html_root: root element of the parsed html page
    :raises NotImplementedError: always, unless overridden by a subclass
    """
    # An explicit raise (rather than ``assert False``) keeps this stub loud
    # even when python runs with -O, which strips assert statements.
    raise NotImplementedError
def _parse_proc_change_options(self, html_root):
proc_options = Module('processor-change')
#module_root_element = DellConfiguratorParser._get_module(html_root, 'Processeurs (Passage)')
module_root_element = DellConfiguratorParser._get_module(html_root, 'Processeurs (Passage)')
for option_root_element in module_root_element.xpath(".//div[@class='row']"):
label_elements = option_root_element.xpath(".//div[@class='option-not-selected ']")
#module_root_element = self._get_module(html_root, 'Processeurs (Passage)')
module_root_element = self._get_module(html_root, self.get_module_label('cpu_change'))
for option_root_element in module_root_element.xpath(self.get_xpath_filter('module_to_options')):
label_elements = option_root_element.xpath(self.get_xpath_filter('option_to_label'))
if len(label_elements) > 0:
label = label_elements[0].text_content().replace('\n', '')
price = DellConfiguratorParser.price_str_as_float(option_root_element.xpath(".//div[@class='col-md-3 text-right option-price ']")[0].text_content())
price = self.price_str_as_float(option_root_element.xpath(self.get_xpath_filter('option_to_price'))[0].text_content())
# print(label, price)
num_cpus = 1
# Passage à processeur Intel Xeon Gold 6240L 2.6GHz, 24.75M Cache,10.40GT/s, 2UPI, Turbo, HT,18C/36T (150W) - DDR4-2933
@ -444,13 +451,13 @@ class DellConfiguratorParser():
def _parse_proc_options(self, html_root):
proc_options = Module('processor')
#module_root_element = DellConfiguratorParser._get_module(html_root, 'Processeurs (Passage)')
module_root_element = DellConfiguratorParser._get_module(html_root, 'Processeurs additionnels')
for option_root_element in module_root_element.xpath(".//div[@class='row']"):
label_elements = option_root_element.xpath(".//div[@class='option-not-selected ']")
#module_root_element = self._get_module(html_root, 'Processeurs (Passage)')
module_root_element = self._get_module(html_root, self.get_module_label('additional_cpus'))
for option_root_element in module_root_element.xpath(self.get_xpath_filter('module_to_options')):
label_elements = option_root_element.xpath(self.get_xpath_filter('option_to_label'))
if len(label_elements) > 0:
label = label_elements[0].text_content()
price = DellConfiguratorParser.price_str_as_float(option_root_element.xpath(".//div[@class='col-md-3 text-right option-price ']")[0].text_content())
price = self.price_str_as_float(option_root_element.xpath(self.get_xpath_filter('option_to_price'))[0].text_content())
# print(label, price)
num_additional_cpus = 1
match = re.match(r'^Processeur additionnel Intel Xeon (?P<cpu_class>Silver|Gold|Platinium) (?P<cpu_number>[0-9][0-9][0-9][0-9][RLY]?).*', label)
@ -471,16 +478,16 @@ class DellConfiguratorParser():
def _parse_ram_options(self, html_root):
ram_options = Module('ram')
#module_root_element = DellConfiguratorParser._get_module(html_root, 'Processeurs (Passage)')
module_root_element = DellConfiguratorParser._get_module(html_root, 'Mémoire: Ajout de barettes additionnelles')
for option_root_element in module_root_element.xpath(".//div[@class='row']"):
label_elements = option_root_element.xpath(".//div[@class='option-not-selected ']")
#module_root_element = self._get_module(html_root, 'Processeurs (Passage)')
module_root_element = self._get_module(html_root, self.get_module_label('ram_additions'))
for option_root_element in module_root_element.xpath(self.get_xpath_filter('module_to_options')):
label_elements = option_root_element.xpath(self.get_xpath_filter('option_to_label'))
if len(label_elements) > 0:
label = label_elements[0].text_content()
price = DellConfiguratorParser.price_str_as_float(option_root_element.xpath(".//div[@class='col-md-3 text-right option-price ']")[0].text_content())
price = self.price_str_as_float(option_root_element.xpath(self.get_xpath_filter('option_to_price'))[0].text_content())
# print(label, price)
# Ajout d'une barette de 128Go 2667 Mhz LRDIMM
match = re.match(r'^Ajout d\'une barette de (?P<num_gb>[0-9]+)Go (?P<num_mhz>[0-9][0-9][0-9][0-9]) Mhz (?P<mem_type>LRDIMM|RDIMM)$', label)
match = re.match(r'^Ajout d\'une barette de (?P<num_gb>[0-9]+)Go (?P<num_mhz>[0-9][0-9][0-9][0-9]) *M[Hh]z (?P<mem_type>LRDIMM|RDIMM)$', label)
if match:
# print(match['num_gb'], match['num_mhz'])
@ -505,19 +512,9 @@ class DellConfiguratorParser():
assert len(ram_options.options) > 0
return ram_options
@classmethod
def _get_module_default_item(cls, module_label, html_root):
module_root_element = DellConfiguratorParser._get_module(html_root, module_label)
assert module_root_element is not None
for option_root_element in module_root_element.xpath(".//div[@class='row']"):
label_elements = option_root_element.xpath(".//div[@class='option-selected ']")
assert label_elements is not None
if len(label_elements) > 0:
label = label_elements[0].text_content().replace('\n', '')
price = DellConfiguratorParser.price_str_as_float(option_root_element.xpath(".//div[@class='col-md-3 text-right option-price option-price-selected']")[0].text_content())
assert price == 0.0, 'default items are expected to have a price of 0.0 € (%s s price is %f)' % (label, price)
return label
assert False, 'failed to find the default item of module %s' % module_label
@abstractmethod
def _get_module_default_item(self, module_label, html_root):
assert False
def _parse_base_config(self, html_root, configurator):
base_config = Config(configurator)
@ -525,7 +522,7 @@ class DellConfiguratorParser():
base_config.num_cpu_per_server = 1
# initialize cpu
item_label = DellConfiguratorParser._get_module_default_item('Processeurs (Passage)', html_root)
item_label = self._get_module_default_item('Processeurs (Passage)', html_root)
# Processeur Intel Xeon Silver 4208 2.1GHz,11M Cache,9.60GT/s, 2UPI,No Turbo, HT,8C/16T (85W) - DDR4-2400
match = re.match(r'^Processeur Intel Xeon (?P<cpu_class>Silver|Gold|Platinium) (?P<cpu_number>[0-9][0-9][0-9][0-9][RLYU]?).*', item_label)
if match is None:
@ -537,7 +534,7 @@ class DellConfiguratorParser():
base_config.set_cpu(Cpu(cpu_id))
# initialize the default ram dimms
item_label = DellConfiguratorParser._get_module_default_item('Mémoires (Passage)', html_root)
item_label = self._get_module_default_item(self.get_module_label('ram_change'), html_root)
# Mémoire 16 Go DDR4 à 2933MHz (1x16Go)
match = re.match(r'^Mémoire (?P<num_gb>[0-9]+) Go DDR4 à (?P<num_mhz>[0-9]+)MHz \((?P<num_dimms>[0-9]+)x(?P<num_gb_per_dimm>[0-9]+)Go\)', item_label)
assert match, 'unhandled label : %s' % item_label
@ -610,18 +607,21 @@ class DellConfiguratorParser():
# modules_element = body.xpath("//div[@class='col-md-10']")
module_root_element = DellConfiguratorParser._get_module(html_root, 'Base')
module_root_element = self._get_module(html_root, 'Base')
assert module_root_element is not None
for option_root_element in module_root_element.xpath(".//div[@class='row']"):
label_elements = option_root_element.xpath(".//div[@class='option-selected ']")
assert len(label_elements) > 0
label = label_elements[0].text_content().replace('\n', '')
# PowerEdge R640
match = re.match(r'^PowerEdge (?P<chassis_type>[CR][0-9][0-9][0-9][0-9]?).*', label)
assert match, 'unhandled label : %s' % label
# print(match['cpu_class'], match['cpu_number'])
chassis_id = "dell-poweredge-%s" % (match['chassis_type'].lower(), )
configurator.chassis = Option(Chassis(chassis_id), 0.0)
#option_root_elements = module_root_element.xpath(".//div[@class='row']")
#assert len(option_root_elements) > 0
# option_root_element = option_root_elements[0]
label_elements = module_root_element.xpath(self.get_xpath_filter('base_module_to_label'))
assert len(label_elements) > 0
label = label_elements[0].text_content().replace('\n', '')
# PowerEdge R640
match = re.match(r'^PowerEdge (?P<chassis_type>[CR][0-9][0-9][0-9][0-9]?).*', label)
assert match, 'unhandled label : %s' % label
# print(match['cpu_class'], match['cpu_number'])
chassis_id = "dell-poweredge-%s" % (match['chassis_type'].lower(), )
configurator.chassis = Option(Chassis(chassis_id), 0.0)
configurator.base_config = self._parse_base_config(html_root, configurator)
@ -637,6 +637,60 @@ class DellConfiguratorParser():
assert configurator.get_item_price(base_cpu.uid) is not None
# compute the price of the chassis
base_price = self.get_base_price(html_root)
one_cpu_price = configurator.get_item_price(configurator.base_config.cpu.uid)
ram_price = configurator.base_config.ram_price
configurator.chassis.price = base_price - configurator.base_config.num_cpus * one_cpu_price - ram_price
class DellConfiguratorParser2020(DellConfiguratorParser):
# The 2020 parser keeps no state of its own: construction only runs the
# base class initialisation.
def __init__(self):
super().__init__()
def get_module_label(self, module_id):
    """Return the module title used on Dell's 2020 pages for ``module_id``.

    :param module_id: one of 'cpu_change', 'additional_cpus', 'ram_change'
        or 'ram_additions'
    :raises KeyError: if ``module_id`` is not one of the known identifiers
    """
    label_by_module_id = dict(
        cpu_change='Processeurs (Passage)',
        additional_cpus='Processeurs additionnels',
        ram_change='Mémoires (Passage)',
        ram_additions='Mémoire: Ajout de barettes additionnelles',
    )
    return label_by_module_id[module_id]
def get_xpath_filter(self, filter_id):
    """Return the xpath expression to use on Dell's 2020 pages for ``filter_id``.

    :param filter_id: symbolic name of the navigation step (eg
        'module_to_options' to go from a module element to its options)
    :raises KeyError: if ``filter_id`` is not one of the known identifiers
    """
    # Some class values end with a space: they are matched literally
    # against the page's class attributes and must be kept as they are.
    xpath_by_filter_id = dict(
        root_to_modules_element=".//div[@class='col-md-10']",
        modules_element_to_modules=".//div[@class='col-md-12 module']",
        module_to_blue_title=".//div[@class='col-md-4 module-title color-017EB8']",
        module_to_grey_title=".//div[@class='col-md-4 module-title color-808080']",
        module_to_options=".//div[@class='row']",
        option_to_label=".//div[@class='option-not-selected ']",
        option_to_price=".//div[@class='col-md-3 text-right option-price ']",
        base_module_to_label=".//div[@class='option-selected ']",
    )
    return xpath_by_filter_id[filter_id]
def price_str_as_float(self, price_as_str):
    """Convert a price string from Dell's 2020 pages into a float.

    The expected format is an optional sign, a number using ',' as
    thousands separator and '.' as decimal separator, then the euro sign,
    eg '+ 2,255.00 €'.

    :param price_as_str: the price text as found in the page
    :return: the signed price as a float
    :raises AssertionError: if the string doesn't have the expected format
    """
    without_thousands_separators = price_as_str.replace(',', '')
    match = re.match(r'^\s*(?P<sign>[-+]?)\s*(?P<numbers>[0-9.]*)\s*€\s*$', without_thousands_separators)
    assert match, 'unexpected price string (%s)' % price_as_str
    # the sign is glued back onto the digits so that float() handles it
    signed_number = match.group('sign') + match.group('numbers')
    return float(signed_number)
def _get_module_default_item(self, module_label, html_root):
    """Return the label of the option selected by default in a module (2020 pages).

    The options of the module titled ``module_label`` are scanned in
    document order; the first one holding an 'option-selected ' element is
    the default item. Its price is expected to be 0.0 €.

    :param module_label: title of the module as shown on the web page
    :param html_root: root element of the parsed html page
    :return: the label of the default item, with newlines removed
    :raises AssertionError: if no default item is found or if its price is
        not 0.0
    """
    module_root = self._get_module(html_root, module_label)
    assert module_root is not None
    for option_root in module_root.xpath(".//div[@class='row']"):
        selected_labels = option_root.xpath(".//div[@class='option-selected ']")
        assert selected_labels is not None
        if len(selected_labels) == 0:
            # this option is not the selected one
            continue
        label = selected_labels[0].text_content().replace('\n', '')
        price_elements = option_root.xpath(".//div[@class='col-md-3 text-right option-price option-price-selected']")
        price = self.price_str_as_float(price_elements[0].text_content())
        assert price == 0.0, 'default items are expected to have a price of 0.0 € (%s s price is %f)' % (label, price)
        return label
    assert False, 'failed to find the default item of module %s' % module_label
def get_base_price(self, html_root):
base_price = None
price_preview_element = html_root.xpath(".//div[@class='price-preview']")[0]
assert price_preview_element is not None
@ -657,24 +711,108 @@ class DellConfiguratorParser():
if label == 'Prix':
price_value_element = price_element.xpath(".//span[@class='col-md-8']")[0]
assert price_value_element is not None
base_price = DellConfiguratorParser.price_str_as_float(price_value_element.text_content())
base_price = self.price_str_as_float(price_value_element.text_content())
assert base_price is not None
return base_price
class DellConfiguratorParser2021(DellConfiguratorParser):
# The 2021 parser keeps no state of its own: construction only runs the
# base class initialisation.
def __init__(self):
super().__init__()
def get_module_label(self, module_id):
    """Return the module title used on Dell's 2021 pages for ``module_id``.

    :param module_id: one of 'cpu_change', 'additional_cpus', 'ram_change'
        or 'ram_additions'
    :raises KeyError: if ``module_id`` is not one of the known identifiers
    """
    label_by_module_id = dict(
        cpu_change='Processeurs (Passage)',
        additional_cpus='Processeurs additionnels',
        ram_change='Mémoires (Passage)',
        ram_additions='Mémoire: Ajout de barettes additionnelles',
    )
    return label_by_module_id[module_id]
def get_xpath_filter(self, filter_id):
    """Return the xpath expression to use on Dell's 2021 pages for ``filter_id``.

    :param filter_id: symbolic name of the navigation step (eg
        'module_to_options' to go from a module element to its options)
    :raises KeyError: if ``filter_id`` is not one of the known identifiers
    """
    xpath_by_filter_id = dict(
        root_to_modules_element=".//div[@class='modules']",
        modules_element_to_modules=".//div[@class='product-module-configuration']",
        module_to_blue_title=".//header",
        # NOTE(review): same selector as the 2020 layout - confirm that it
        # is still meaningful on 2021 pages
        module_to_grey_title=".//div[@class='col-md-4 module-title color-808080']",
        module_to_options=".//div[@class='product-options-configuration-line']",
        option_to_label=".//div[@class='option-info']",
        option_to_price=".//div[@class='option-price']",
        base_module_to_label=".//div[@class='product-options-configuration-block option-selected']",
    )
    return xpath_by_filter_id[filter_id]
def price_str_as_float(self, price_as_str):
    """Convert a price string from Dell's 2021 pages into a float.

    The expected format is an optional sign, a number using ',' as decimal
    separator and possibly a narrow no-break space as thousands separator,
    then the euro sign, eg '+ 2 255,00 €'.

    :param price_as_str: the price text as found in the page
    :return: the signed price as a float
    :raises AssertionError: if the string doesn't have the expected format
    """
    # Narrow No-Break Space (NNBSP) https://www.compart.com/en/unicode/U+202F
    # Both separators are written as escapes: as literal characters they are
    # invisible and easily lost when the file is edited or copied, which
    # silently turns the replace() below into a no-op.
    nnbsp = '\u202f'
    # the pages also use &nbsp; (U+00A0), eg '+ 0,00&nbsp;€'
    nbsp = '\u00a0'
    normalized = price_as_str.replace(',', '.').replace(nnbsp, '').replace(nbsp, '')
    match = re.match(r'^\s*(?P<sign>[-+]?)\s*(?P<numbers>[0-9.]*)\s*€\s*$', normalized)
    assert match, 'unexpected price string (%s)' % price_as_str
    # print(match['sign'], match['numbers'])
    price_as_float = float("%s%s" % (match['sign'], match['numbers']))
    return price_as_float
def _get_module_default_item(self, module_label, html_root):
    """Return the label of the option selected by default in a module (2021 pages).

    The 'ram_change' module is laid out as blocks while the other modules
    are laid out as lines, hence the two sets of xpath expressions. The
    price of the default item is expected to be 0.0 €.

    :param module_label: title of the module as shown on the web page
    :param html_root: root element of the parsed html page
    :return: the label of the default item, with newlines removed
    :raises AssertionError: if no default item is found or if its price is
        not 0.0
    """
    module_root = self._get_module(html_root, module_label)
    assert module_root is not None
    if module_label == self.get_module_label('ram_change'):
        # <div
        #     class="product-options-configuration-block option-selected">
        #     <header>Mémoire 16 Go DDR4 à 3200MHz (1x16Go)<div
        #             class="option-selector"><i
        #                 class="fas fa-check "></i></div>
        #     </header>
        #     <div class="mt-2 option-price">+ 0,00&nbsp;€</div>
        # </div>
        filters = (
            ".//div[@class='product-options-configuration-block option-selected']",
            ".//header",
            ".//div[@class='mt-2 option-price']",
        )
    else:
        filters = (
            ".//div[@class='product-options-configuration-line option-selected']",
            ".//div[@class='option-info']",
            ".//div[@class='option-price']",
        )
    selected_option_filter, label_filter, price_filter = filters

    for selected_option_root in module_root.xpath(selected_option_filter):
        label_candidates = selected_option_root.xpath(label_filter)
        assert label_candidates is not None
        if len(label_candidates) == 0:
            # no label in this selected element: look at the next one
            continue
        label = label_candidates[0].text_content().replace('\n', '')
        price_text = selected_option_root.xpath(price_filter)[0].text_content()
        price = self.price_str_as_float(price_text)
        assert price == 0.0, 'default items are expected to have a price of 0.0 € (%s s price is %f)' % (label, price)
        return label
    assert False, 'failed to find the default item of module %s' % module_label
def get_base_price(self, html_root):
    """Return the base price ('Prix de base') shown on a 2021 page.

    :param html_root: root element of the parsed html page
    :return: the base price as a float
    :raises IndexError: if the 'product-info' block, or the label/value
        span of one of its 'info' entries, is missing
    :raises AssertionError: if no 'Prix de base' entry is found
    """
    product_info_element = html_root.xpath(".//div[@class='product-info']")[0]
    assert product_info_element is not None
    base_price = None
    # <div class="info"><span class="info-label">Prix de base</span><span
    #         class="info-value strong">1175 € HT</span></div>
    # <hr>
    # <div class="info"><span class="info-label">Avec options</span><span
    #         class="info-value strong">1175 € HT</span></div>
    # <hr>
    for info_element in product_info_element.xpath(".//div[@class='info']"):
        info_label_element = info_element.xpath(".//span[@class='info-label']")[0]
        assert info_label_element is not None
        if info_label_element.text_content().replace('\n', '') != 'Prix de base':
            continue
        info_value_element = info_element.xpath(".//span[@class='info-value strong']")[0]
        assert info_value_element is not None
        # the ' HT' (tax excluded) suffix is not part of the price format
        price_text = info_value_element.text_content().replace(' HT', '')
        base_price = self.price_str_as_float(price_text)
    assert base_price is not None
    return base_price
one_cpu_price = configurator.get_item_price(configurator.base_config.cpu.uid)
ram_price = configurator.base_config.ram_price
configurator.chassis.price = base_price - configurator.base_config.num_cpus * one_cpu_price - ram_price
class DellMatinfoConfigurator(Configurator):
'''
a configurator using the web page from dell matinfo
'''
def __init__(self, dell_configurator_html_file_path):
def __init__(self, dell_configurator_html_file_path, html_parser):
super().__init__(self)
self.base_config = None
self.chassis = None
parser = DellConfiguratorParser()
parser.parse(dell_configurator_html_file_path, self)
html_parser.parse(dell_configurator_html_file_path, self)
def create_config(self):
@ -740,7 +878,7 @@ class DellMatinfoCsvConfigurator(Configurator):
match = re.match(r'^Passage à 2 Processeurs Intel Xeon (?P<cpu_class>Silver|Gold|Platinium) (?P<cpu_number>[0-9][0-9][0-9][0-9][RLYUM]?) .*', label)
if match:
price = DellConfiguratorParser.price_str_as_float(line_cells[COLUMN_PRICE])
price = self.price_str_as_float(line_cells[COLUMN_PRICE])
cpu_class = match['cpu_class'].lower()
if cpu_class == 'platinium':
cpu_class = 'platinum'
@ -752,7 +890,7 @@ class DellMatinfoCsvConfigurator(Configurator):
# Ajout d'une barette de 8Go 2667 Mhz DDR-4 - Pour les 4 serveurs
match = re.match(r'^Ajout d\'une barette de (?P<num_gb>[0-9]+)Go (?P<num_mhz>[0-9][0-9][0-9][0-9]) Mhz (?P<mem_technology>DDR-4) - Pour les 4 serveurs$', label)
if match:
price_for_four = DellConfiguratorParser.price_str_as_float(line_cells[COLUMN_PRICE])
price_for_four = self.price_str_as_float(line_cells[COLUMN_PRICE])
dimm = Dimm(mem_type='rdimm', num_gb=int(match['num_gb']), num_mhz=int(match['num_mhz']))
option = Option(dimm, price_for_four/4.0)
ram_options.add_option(option)

View File

@ -1,5 +1,7 @@
from concho.dell import DellMatinfoCsvConfigurator
from concho.dell import DellMatinfoConfigurator
from concho.dell import DellConfiguratorParser2020
from concho.dell import DellConfiguratorParser2021
from concho.procs_chooser import plot_configurators
from concho.procs_chooser import ConfigPrice
from concho.procs_chooser import ConfigFlops
@ -10,8 +12,8 @@ def test_all_matinfo_2020_configs():
# print(configurator)
configurators = [
DellMatinfoCsvConfigurator('c6420-20200716-price.tsv'),
DellMatinfoConfigurator('rcrc1406676-4834664 - Cat2 Conf4 PowerEdge R640 - Dell.html'),
DellMatinfoConfigurator('rcrc1406676-4824727 - Cat 2 Conf 7 PowerEdge R940 - Dell.html'),
DellMatinfoConfigurator('rcrc1406676-4834664 - Cat2 Conf4 PowerEdge R640 - Dell.html', DellConfiguratorParser2020()),
DellMatinfoConfigurator('rcrc1406676-4824727 - Cat 2 Conf 7 PowerEdge R940 - Dell.html', DellConfiguratorParser2020()),
# dell.DellPowerEdgeR940(),
]
@ -22,8 +24,8 @@ def test_credits_2020_configs():
# print(configurator)
configurators = [
DellMatinfoCsvConfigurator('c6420-20200716-price.tsv'),
DellMatinfoConfigurator('rcrc1406676-4834664 - Cat2 Conf4 PowerEdge R640 - Dell.html'),
DellMatinfoConfigurator('rcrc1406676-4824727 - Cat 2 Conf 7 PowerEdge R940 - Dell.html'),
DellMatinfoConfigurator('rcrc1406676-4834664 - Cat2 Conf4 PowerEdge R640 - Dell.html', DellConfiguratorParser2020()),
DellMatinfoConfigurator('rcrc1406676-4824727 - Cat 2 Conf 7 PowerEdge R940 - Dell.html', DellConfiguratorParser2020()),
# dell.DellPowerEdgeR940(),
]
@ -38,10 +40,30 @@ def test_credits_2020_configs():
# 'intel-xeon-gold-6240',
# ]
config_filter = lambda config : config.get_price() < 15000.0
config_filter = lambda config : config.get_price() < 40000.0
plot_configurators(configurators=configurators, ram_per_core=4.0e9, xaxis_def=ConfigPrice(), yaxis_def=ConfigFlopsPerEuro(), plot_title='physmol/ts credit 2020 configs', config_filter=config_filter)
def test_credits_2021_configs():
    """Plot flops per euro against price for the 2021 Dell matinfo configurations.

    The configurations are read from the 2021 version of the R640
    configurator page, using the parser dedicated to the 2021 page format.
    """
    configurators = [
        DellMatinfoConfigurator('20210407 - Cat2 Conf4 PowerEdge R640 - Dell.html', DellConfiguratorParser2021()),
    ]
    # alternative filter, restricting the plot to a selection of cpus:
    # config_filter = lambda config : config.cpu.uid in [
    #     'intel-xeon-gold-5222',
    #     'intel-xeon-gold-6226r',
    #     'intel-xeon-gold-6230r',
    #     'intel-xeon-gold-6234r',
    #     'intel-xeon-gold-6240r',
    #     'intel-xeon-gold-6248r',
    #     'intel-xeon-gold-6230',
    #     'intel-xeon-gold-6240',
    # ]

    def config_filter(config):
        # only keep the configurations that fit in the budget
        return config.get_price() < 40000.0

    # the title now says 2021: it used to be a verbatim copy of the 2020
    # test's title, which mislabelled the resulting plot
    plot_configurators(configurators=configurators, ram_per_core=4.0e9, xaxis_def=ConfigPrice(), yaxis_def=ConfigFlopsPerEuro(), plot_title='physmol/ts credit 2021 configs', config_filter=config_filter)
if __name__ == '__main__':
test_credits_2020_configs()
test_credits_2021_configs()