Add initial code
[ta/build-tools.git] / tools / script / create_rpm_data.py
1 #!/usr/bin/env python
2 # Copyright 2019 Nokia
3 #
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7 #
8 #     http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16 # pylint: disable=too-many-instance-attributes,too-many-arguments
17
18 import argparse
19 import copy
20 import sys
21 import logging
22 import re
23 import json
24 from pprint import pformat
25
26 import os
27
28 from tools.rpm import RpmInfoParser
29 from tools.utils import apply_jenkins_auth
30 from tools.yum import Yum, YumInfoParser
31 from tools.repository import RepositoryConfig
32 from tools.log import set_logging
33 from tools.io import read_from, write_to, read_json
34 from tools.convert import to_json, CsvConverter
35
36
class RpmDataBuilder(object):
    """Combine data about installed RPMs with repository, crypto and BOM data.

    The builder parses "yum info" and "rpm -qai" style outputs, merges them
    per package and annotates every installed RPM with the repository it is
    available in, its obsoletes, crypto capability and bill of materials.
    """

    def __init__(self, build_config, yum_info_installed, rpm_info_installed,
                 crypto_info_installed, boms, remote=False):
        """Store the raw inputs.

        :param build_config: value handed to ``RepositoryConfig``.
        :param yum_info_installed: yum output passed to ``YumInfoParser``.
        :param rpm_info_installed: rpm output passed to ``RpmInfoParser``.
        :param crypto_info_installed: JSON text; iterated as a sequence of
            objects having a ``name`` key.
        :param boms: mapping looked up with ``str(rpm)``; values are BOM
            documents validated by ``_validate_bom``.
        :param remote: when true the localrepo is read from its remote URL.
        """
        self.remote = remote
        self.yum_info_installed = yum_info_installed
        self.rpm_info_installed = rpm_info_installed
        self.crypto_info_installed = json.loads(crypto_info_installed)
        self.boms = boms
        logging.debug('BOMS: {}'.format(pformat(self.boms)))
        self.repoconfig = RepositoryConfig(build_config)
        self.installed_rpms = None
        self.repos = None

    def run(self):
        """Build and return the list of annotated installed RPMs."""
        self.installed_rpms = self.read_installed_rpms()
        srpms = {rpm['Source RPM'] for rpm in self.installed_rpms}
        logging.info('Installed RPMs:{} SRPMs:{}'.format(len(self.installed_rpms), len(srpms)))
        self.repos = self._read_configured_repos()
        logging.info('Configured repos: {}'.format(len(self.repos)))
        available_rpms = self._read_available_rpms(self.repos)
        logging.info('Found {} available RPMs in binary repos'.format(
            sum(len(repo_rpms) for repo_rpms in available_rpms.values())))
        for i_rpm in self.installed_rpms:
            i_rpm_repo_name = self._get_rpm_available_in(i_rpm, available_rpms)
            i_rpm['Repo data'] = self._get_repo(i_rpm_repo_name)
            i_rpm['Obsoletes'] = self._resolve_obsoletes(i_rpm)
            i_rpm['Crypto capable'] = self._resolve_ecc(str(i_rpm))
            i_rpm['BOM'] = self._resolve_bom(i_rpm)
        self._log_repo_rpm_statistics()
        self._log_rpm_statistics()
        return self.installed_rpms

    @staticmethod
    def _resolve_obsoletes(rpm):
        """Return the 'Obsoletes' value, or 'N/A' when missing or '(none)'."""
        if 'Obsoletes' not in rpm or rpm['Obsoletes'] == '(none)':
            return 'N/A'
        return rpm['Obsoletes']

    def _resolve_ecc(self, rpm):
        """Return True when ``rpm`` equals the 'name' of any crypto info item."""
        return any(item['name'] == rpm for item in self.crypto_info_installed)

    def _resolve_bom(self, rpm):
        """Return the validated 'bom' content for ``rpm``, or '' if none exists."""
        rpm_name = str(rpm)
        bom_content = self.boms.get(rpm_name)
        if bom_content is None:
            return ''
        self._validate_bom(rpm_name, bom_content)
        return bom_content['bom']

    @staticmethod
    def _validate_bom(rpm_name, bom_content):
        """Check the BOM structure; raise Exception describing the expected format.

        A missing 'crypto-capable' key is only logged as a warning.
        """
        try:
            if 'bom' not in bom_content:
                raise Exception('BOM base object "bom" missing')
            bom = bom_content['bom']
            for material in bom:
                for key in ['name', 'version', 'source-url', 'foss']:
                    if key not in material:
                        raise Exception('Key "{}" not found in BOM'.format(key))
                if material['foss'].lower() not in ['yes', 'no', 'modified']:
                    raise Exception('BOM foss value not valid')
            missing_crypto_count = sum(1 for material in bom
                                       if 'crypto-capable' not in material)
            if missing_crypto_count != 0:
                logging.warning(
                    'crypto-capable missing from %s materials in RPM %s',
                    missing_crypto_count, rpm_name)
        except Exception as e:
            # Any failure (including malformed types) is reported together
            # with an example of the expected document layout.
            correct_format = {'bom': [
                {'name': '<component-name>',
                 'version': '<component-version>',
                 'source-url': '<source-url>',
                 'foss': '<yes/no/modified>',
                 'crypto-capable': '<true/false (OPTIONAL)>'}]}
            msg_fmt = 'BOM for {rpm} is not correct format. {error}:\n{correct_format}'
            raise Exception(msg_fmt.format(rpm=rpm_name,
                                           error=str(e),
                                           correct_format=pformat(correct_format)))

    def _get_repo(self, name):
        """Return the configured repo whose 'name' matches, else raise Exception."""
        for r in self.repos:
            if r['name'] == name:
                return r
        raise Exception('No repository found with name: {}'.format(name))

    def read_installed_rpms(self):
        """Parse yum and rpm outputs and return the merged per-package data."""
        installed_rpms = []
        yum_rpms = YumInfoParser().parse_installed(self.yum_info_installed)
        rpm_rpms = RpmInfoParser().parse_multiple(self.rpm_info_installed)
        self._validate_rpm_lists_identical(yum_rpms, rpm_rpms)
        yum_rpms_dict = {rpm['Name']: rpm for rpm in yum_rpms}
        for rpm_data in rpm_rpms:
            yum_data = yum_rpms_dict[rpm_data['Name']]
            combined_data = self._combine_rpm_data(rpm_data, yum_data)
            installed_rpms.append(combined_data)
        # Only a debug aid: must not crash when nothing is installed.
        if installed_rpms:
            logging.debug(
                'One parsed RPM data as example:\n{}'.format(pformat(installed_rpms[0])))
        return installed_rpms

    def _combine_rpm_data(self, rpm_data, yum_data):
        """Return a deep copy of ``rpm_data`` extended with fields from ``yum_data``.

        Fields present in both must match (after ASCII normalisation of the
        rpm value) unless they are known to differ; otherwise Exception is
        raised.
        """
        combined_data = copy.deepcopy(rpm_data)
        fields_known_to_differ = ['Description',  # May contain differing newline and indentation
                                  'Size']  # Bytes in RPM, humanreadable in yum
        yum2rpm_field_name_map = {'Arch': 'Architecture'}
        for yum_key in yum_data:
            rpm_key = yum2rpm_field_name_map.get(yum_key, yum_key)
            if rpm_key not in combined_data:
                combined_data[rpm_key] = yum_data[yum_key]
                continue
            yum_comparable_rpm_string = self._rpm_info_str_to_yum_info_str(
                combined_data[rpm_key])
            if yum_comparable_rpm_string == yum_data[yum_key]:
                continue
            if rpm_key in fields_known_to_differ:
                continue
            raise Exception(
                'RPM data in "{}" not match in rpm "{}" vs yum "{}" for package {}'.format(
                    rpm_key,
                    repr(combined_data[rpm_key]),
                    repr(yum_data[yum_key]),
                    combined_data))
        return combined_data

    @staticmethod
    def _rpm_info_str_to_yum_info_str(string):
        """Replace every run of non-ASCII characters in ``string`` with '?'.

        Pure ASCII input is returned unchanged. The substitution is applied
        unconditionally instead of probing with ``str.decode()``, which only
        exists on Python 2; the result is the same there.
        """
        try:
            return re.sub(r'[^\x00-\x7F]+', '?', string)
        except Exception as e:
            logging.error('{}: for string {}'.format(str(e), repr(string)))
            raise

    @staticmethod
    def _validate_rpm_lists_identical(yum_rpms, rpm_rpms):
        """Raise unless both lists describe the same set of packages."""
        yum_rpms_dict = {rpm['Name']: rpm for rpm in yum_rpms}
        rpm_rpms_dict = {rpm['Name']: rpm for rpm in rpm_rpms}
        if len(yum_rpms) != len(rpm_rpms):
            raise Exception(
                'Given RPM lists are unequal: yum RPM count {} != rpm RPM count {}'.format(
                    len(yum_rpms), len(rpm_rpms)))
        yum_names = set(yum_rpms_dict)
        rpm_names = set(rpm_rpms_dict)
        if yum_names != rpm_names:
            # Raised explicitly (not via ``assert``) so the check survives
            # ``python -O``; the exception type is kept as AssertionError.
            raise AssertionError(
                'RPM names differ: only in yum {}, only in rpm {}'.format(
                    sorted(yum_names - rpm_names), sorted(rpm_names - yum_names)))
        for name in yum_rpms_dict:
            if not yum_rpms_dict[name].is_same_package_as(rpm_rpms_dict[name]):
                raise Exception(
                    'Packages are not same: yum {} != rpm {}'.format(yum_rpms_dict[name],
                                                                     rpm_rpms_dict[name]))

    def _read_configured_repos(self):
        """Return repos from the build config sections plus the remote localrepo."""
        repos = self.repoconfig.read_sections(
            ['baseimage-repositories', 'repositories'])
        repos.append(self.repoconfig.get_localrepo(remote=True))
        logging.debug('Configured repos: {}'.format(pformat(repos)))
        return repos

    def _read_available_rpms(self, repos):
        """Query yum for all packages in ``repos`` and group them by 'Repo'."""
        Yum.clean_and_remove_cache()
        yum = Yum()
        for repo in repos:
            name = repo['name']
            if name != 'localrepo':
                yum.add_repo(name, repo['baseurl'])
            elif self.remote:
                url = self.repoconfig.get_localrepo(remote=True)['baseurl']
                yum.add_repo(name, apply_jenkins_auth(url))
            else:
                url = self.repoconfig.get_localrepo(remote=False)['baseurl']
                yum.add_repo(name, url)
        yum_available_output = yum.read_all_packages()
        available_rpms = YumInfoParser().parse_available(yum_available_output)
        rpms_per_repo = {}
        for rpm in available_rpms:
            rpms_per_repo.setdefault(rpm.get('Repo'), []).append(rpm)
        return rpms_per_repo

    def _log_repo_rpm_statistics(self):
        """Log how many installed RPMs come from each configured repo."""
        logging.info('--- RPM repo statistics ---')
        # Loop invariant: names of the "none<N>" repos.
        none_repo_names = [r['name'] for r in self._get_nonerepos()]
        for repo in self.repos:
            name = repo['name']
            repo_url = repo['baseurl']
            # RPMs resolved to a "none*" repo carry no 'From repo' value.
            expected_from_repo = None if name in none_repo_names else name
            repo_installed_rpm_count = len([rpm for rpm in self.installed_rpms if
                                            rpm['Repo data']['baseurl'] == repo_url and rpm.get(
                                                'From repo') == expected_from_repo])
            logging.info(
                'RPMs installed from repo "{}": {}'.format(name, repo_installed_rpm_count))
            if repo_installed_rpm_count == 0:
                logging.warning(
                    'Repository configured but no RPMs installed: {}={}'.format(name, repo_url))

        return self.installed_rpms

    def _log_rpm_statistics(self):
        """Log total, crypto capable and BOM carrying RPM counts."""
        def _get_count(func):
            return sum(1 for rpm in self.installed_rpms if func(rpm))

        logging.info('----- RPMs per type -----')
        logging.info(' => Total: %s', len(self.installed_rpms))
        logging.info('----- RPMs per attribute -----')
        logging.info(' * Crypto capable: %s', _get_count(lambda rpm: rpm['Crypto capable']))
        logging.info(' * Complex (BOM): %s', _get_count(lambda rpm: rpm['BOM']))

    def _get_rpm_available_in(self, rpm, available_rpms):
        """Return the name of the repo in which the installed ``rpm`` is available.

        :param rpm: installed RPM data.
        :param available_rpms: mapping of repo name to available RPMs.
        :raises Exception: when the RPM is not found in the expected repo(s).
        """
        if 'From repo' in rpm:
            from_repo = rpm['From repo']
            if from_repo == 'localrepo':
                return 'localrepo'
            # A repo without any available package has no entry in the
            # mapping; treat it as empty so the descriptive error is raised.
            available_repo_rpms = available_rpms.get(from_repo, [])
            if any(self._is_same_rpm(a_rpm, rpm) for a_rpm in available_repo_rpms):
                return from_repo
            rpms_in_matching_repo = [str(a_rpm) for a_rpm in available_repo_rpms]
            rpms_with_matching_name = [str(a_rpm) for a_rpm in available_repo_rpms if
                                       rpm['Name'] == a_rpm['Name']]
            if len(rpms_in_matching_repo) <= 1000:
                logging.debug(
                    'Available RPMs in {}: {}'.format(from_repo, rpms_in_matching_repo))
            error_str = 'RPM "{}" is not available in configured repo: {}, ' \
                        'RPMs with correct name: {}'.format(str(rpm), from_repo,
                                                            rpms_with_matching_name)
            raise Exception(error_str)

        none_repos = self._get_nonerepos()
        for repo in [r['name'] for r in none_repos]:
            # Keep searching the remaining "none*" repos when one is empty.
            for a_rpm in available_rpms.get(repo, []):
                if self._is_same_rpm(a_rpm, rpm):
                    return repo
        msg = 'RPM "{}" is not available in any configured "none*" repos: {}'.format(
            rpm['Name'], none_repos)
        raise Exception(msg)

    def _get_nonerepos(self):
        """Return configured repos whose name is 'none' followed by digits."""
        return [repo for repo in self.repos if re.match(r'^none\d+$', repo['name'])]

    @staticmethod
    def _is_same_rpm(rpm1, rpm2):
        """Compare name, version, release and arch of an available and installed RPM.

        ``rpm1`` uses the yum key 'Arch', ``rpm2`` the rpm key 'Architecture'.
        """
        return rpm1['Name'] == rpm2['Name'] and \
               rpm1['Version'] == rpm2['Version'] and \
               rpm1['Release'] == rpm2['Release'] and \
               rpm1['Arch'] == rpm2['Architecture']
289
290
def parse(args):
    """Parse the command line arguments given as a list of strings.

    Returns the ``argparse`` namespace. The yum info, rpm info and build
    config paths are mandatory; every other option is optional.
    """
    parser = argparse.ArgumentParser(
        description='Generate package info',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # (flags, keyword arguments) in the order they appear in the help output.
    option_table = (
        (('--verbose', '-v'), {'action': 'store_true',
                               'help': 'More verbose logging'}),
        (('--yum-info-path',), {'required': True,
                                'help': '"yum info all" output as file'}),
        (('--rpm-info-path',), {'required': True,
                                'help': '"rpm -qai" output as file'}),
        (('--crypto-info-path',), {'help': 'Dir from where to find ECC file'}),
        (('--boms-path',), {'help': 'Dir from where to find RPM bill of material files'}),
        (('--output-rpmlist',), {'help': 'output as rpm list like "rpm-qa"'}),
        (('--output-json',), {'help': 'output json file path'}),
        (('--output-csv',), {'help': 'output csv file path'}),
        (('--output-ms-csv',), {'help': 'output Microsoft Excel compatible csv file path'}),
        (('--build-config-path',), {'required': True,
                                    'help': 'Build configuration ini path'}),
        (('--remote',), {'action': 'store_true',
                         'help': 'Read localrepo from remote defined by BUILD_URL, '
                                 'otherwise use localrepo from WORKSPACE'}),
    )
    for flags, options in option_table:
        parser.add_argument(*flags, **options)
    return parser.parse_args(args)
320
321
def read_files(boms_dir):
    """Read every file in ``boms_dir`` as JSON, keyed by file name.

    Returns an empty dict when ``boms_dir`` is not given (None or empty),
    since the BOM directory is an optional command line argument.
    """
    if not boms_dir:
        return {}
    boms = {}
    for f in os.listdir(boms_dir):
        boms[f] = read_json(os.path.join(boms_dir, f))
    return boms
327
328
def main(input_args):
    """Build the RPM data and write it in the requested output formats.

    :param input_args: command line arguments without the program name.

    The data is printed to stdout when neither a JSON nor a CSV output path
    is given.
    """
    args = parse(input_args)
    if args.verbose:
        set_logging(debug=True, timestamps=True)
    else:
        set_logging(debug=False)
    # --crypto-info-path and --boms-path are optional arguments: fall back to
    # "no crypto info" (an empty JSON list) and "no BOMs" instead of trying
    # to read from a missing path.
    if args.crypto_info_path:
        crypto_info = read_from(args.crypto_info_path)
    else:
        crypto_info = '[]'
    if args.boms_path:
        boms = read_files(args.boms_path)
    else:
        boms = {}
    rpmdata = RpmDataBuilder(args.build_config_path,
                             read_from(args.yum_info_path),
                             read_from(args.rpm_info_path),
                             crypto_info,
                             boms,
                             remote=args.remote).run()
    if args.output_rpmlist:
        write_to(args.output_rpmlist, '\n'.join(sorted(str(rpm) for rpm in rpmdata)))
    if args.output_json:
        write_to(args.output_json, to_json(rpmdata))
    csv = CsvConverter(rpmdata, preferred_field_order=['Name', 'Version', 'Release',
                                                       'License', 'Vendor', 'From repo',
                                                       'Source RPM'])
    if args.output_csv:
        write_to(args.output_csv, str(csv))
    if args.output_ms_csv:
        write_to(args.output_ms_csv,
                 csv.convert_to_ms_excel(text_fields=['Version', 'Size', 'Release']))
    if not args.output_json and not args.output_csv:
        print(rpmdata)
355
356
if __name__ == '__main__':
    # Script entry point: hand the arguments after the program name to main().
    cli_arguments = sys.argv[1:]
    main(cli_arguments)