82beabfeab414ad38c096c60a9a28a55a8838e8d
[ta/build-tools.git] / tools / script / create_rpm_data.py
1 #!/usr/bin/env python
2 # Copyright 2019 Nokia
3 #
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7 #
8 #     http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16 # pylint: disable=too-many-instance-attributes,too-many-arguments
17
18 import argparse
19 import copy
20 import sys
21 import logging
22 import re
23 import json
24 from pprint import pformat
25
26 import os
27
28 from tools.rpm import RpmInfoParser
29 from tools.utils import apply_jenkins_auth
30 from tools.yum import Yum, YumInfoParser
31 from tools.repository import RepositoryConfig
32 from tools.log import set_logging
33 from tools.io import read_from, write_to, read_json
34 from tools.convert import to_json, CsvConverter
35
36
class RpmDataBuilder(object):
    """Build one combined data record for every installed RPM.

    The yum and rpm views of the installed packages are merged, and each
    package is then annotated with the repository it is available in, its
    obsoletes, a crypto capability flag and its bill of materials (BOM).
    """

    def __init__(self, build_config, yum_info_installed, rpm_info_installed,
                 crypto_info_installed, boms, remote=False):
        """Store the raw inputs; only the crypto JSON is parsed here.

        :param build_config: passed unchanged to ``RepositoryConfig``
        :param yum_info_installed: text given to
            ``YumInfoParser.parse_installed``
        :param rpm_info_installed: text given to
            ``RpmInfoParser.parse_multiple``
        :param crypto_info_installed: JSON text; the decoded value is iterated
            and every item is read through its ``name`` key
        :param boms: mapping looked up with ``str(rpm)``; each value is
            expected to hold a ``bom`` list
        :param remote: when true the localrepo URL is the remote one and is
            passed through ``apply_jenkins_auth``
        """
        self.remote = remote
        self.yum_info_installed = yum_info_installed
        self.rpm_info_installed = rpm_info_installed
        self.crypto_info_installed = json.loads(crypto_info_installed)
        self.boms = boms
        logging.debug('BOMS: {}'.format(pformat(self.boms)))
        self.repoconfig = RepositoryConfig(build_config)
        self.installed_rpms = None
        self.repos = None

    def run(self):
        """Collect and annotate the installed RPMs.

        Every installed RPM gets the keys ``Repo data``, ``Obsoletes``,
        ``Crypto capable`` and ``BOM``.

        :return: list of the annotated installed RPM records
        :raises Exception: when an RPM cannot be found in the configured
            repositories or when the input data is inconsistent
        """
        self.installed_rpms = self.read_installed_rpms()
        srpms = {rpm['Source RPM'] for rpm in self.installed_rpms}
        logging.info('Installed RPMs:{} SRPMs:{}'.format(len(self.installed_rpms), len(srpms)))
        self.repos = self._read_configured_repos()
        logging.info('Configured repos: {}'.format(len(self.repos)))
        available_rpms = self._read_available_rpms(self.repos)
        logging.info('Found {} available RPMs in binary repos'.format(
            sum(len(repo_rpms) for repo_rpms in available_rpms.values())))
        for i_rpm in self.installed_rpms:
            i_rpm_repo_name = self._get_rpm_available_in(i_rpm, available_rpms)
            i_rpm['Repo data'] = self._get_repo(i_rpm_repo_name)
            i_rpm['Obsoletes'] = self._resolve_obsoletes(i_rpm)
            i_rpm['Crypto capable'] = self._resolve_ecc(str(i_rpm))
            i_rpm['BOM'] = self._resolve_bom(i_rpm)
        self._log_repo_rpm_statistics()
        self._log_rpm_statistics()
        return self.installed_rpms

    @staticmethod
    def _resolve_obsoletes(rpm):
        """Return the ``Obsoletes`` value, or 'N/A' when missing or '(none)'."""
        if 'Obsoletes' not in rpm or rpm['Obsoletes'] == '(none)':
            return 'N/A'
        return rpm['Obsoletes']

    def _resolve_ecc(self, rpm):
        """Return True when ``rpm`` equals the ``name`` of any crypto item."""
        return any(item['name'] == rpm for item in self.crypto_info_installed)

    def _resolve_bom(self, rpm):
        """Return the validated ``bom`` list of the RPM, or '' when it has none.

        :raises Exception: when a BOM exists but is not in the correct format
        """
        rpm_name = str(rpm)
        bom_content = self.boms.get(rpm_name)
        if bom_content is None:
            return ''
        self._validate_bom(rpm_name, bom_content)
        return bom_content['bom']

    @staticmethod
    def _validate_bom(rpm_name, bom_content):
        """Check that a BOM has the mandatory structure.

        Every material needs the keys ``name``, ``version``, ``source-url``
        and ``foss``; ``foss`` must be yes/no/modified (case-insensitive).
        A missing ``crypto-capable`` key only produces a warning.

        :raises Exception: on any problem; the message names the RPM, the
            original error and an example of the correct format
        """
        try:
            if 'bom' not in bom_content:
                raise Exception('BOM base object "bom" missing')
            bom = bom_content['bom']
            for material in bom:
                for key in ['name', 'version', 'source-url', 'foss']:
                    if key not in material:
                        raise Exception('Key "{}" not found in BOM'.format(key))
                if material['foss'].lower() not in ['yes', 'no', 'modified']:
                    raise Exception('BOM foss value not valid')
            missing_crypto_count = sum(1 for material in bom
                                       if 'crypto-capable' not in material)
            if missing_crypto_count != 0:
                logging.warning(
                    'crypto-capable missing from %s materials in RPM %s',
                    missing_crypto_count, rpm_name)
        except Exception as e:
            # Any failure (including unexpected value types) is reported
            # together with a template of the expected format.
            correct_format = {'bom': [
                {'name': '<component-name>',
                 'version': '<component-version>',
                 'source-url': '<source-url>',
                 'foss': '<yes/no/modified>',
                 'crypto-capable': '<true/false (OPTIONAL)>'}]}
            msg_fmt = 'BOM for {rpm} is not correct format. {error}:\n{correct_format}'
            raise Exception(msg_fmt.format(rpm=rpm_name,
                                           error=str(e),
                                           correct_format=pformat(correct_format)))

    def _get_repo(self, name):
        """Return the configured repo whose ``name`` matches.

        :raises Exception: when no configured repo has that name
        """
        for r in self.repos:
            if r['name'] == name:
                return r
        raise Exception('No repository found with name: {}'.format(name))

    def read_installed_rpms(self):
        """Parse the yum and rpm inputs and merge them per package name.

        :return: list with one combined record per RPM parsed from the rpm
            input
        :raises Exception: when the two inputs do not describe the same
            packages or their common fields disagree
        """
        installed_rpms = []
        yum_rpms = YumInfoParser().parse_installed(self.yum_info_installed)
        rpm_rpms = RpmInfoParser().parse_multiple(self.rpm_info_installed)
        self._validate_rpm_lists_identical(yum_rpms, rpm_rpms)
        yum_rpms_dict = {rpm['Name']: rpm for rpm in yum_rpms}
        for rpm_data in rpm_rpms:
            yum_data = yum_rpms_dict[rpm_data['Name']]
            combined_data = self._combine_rpm_data(rpm_data, yum_data)
            installed_rpms.append(combined_data)
        # Guarded so that an empty package list does not fail on indexing.
        if installed_rpms:
            logging.debug(
                'One parsed RPM data as example:\n{}'.format(pformat(installed_rpms[0])))
        return installed_rpms

    def _combine_rpm_data(self, rpm_data, yum_data):
        """Return a deep copy of ``rpm_data`` extended with yum-only fields.

        Fields present in both sources must be equal after the rpm value has
        been normalised with ``_rpm_info_str_to_yum_info_str``; fields known
        to differ keep the rpm value.

        :raises Exception: when a common field differs unexpectedly
        """
        combined_data = copy.deepcopy(rpm_data)
        fields_known_to_differ = ['Description',  # May contain differing newline and indentation
                                  'Size']  # Bytes in RPM, humanreadable in yum
        yum2rpm_field_name_map = {'Arch': 'Architecture'}
        for yum_key in yum_data:
            rpm_key = yum2rpm_field_name_map.get(yum_key, yum_key)
            if rpm_key not in combined_data:
                combined_data[rpm_key] = yum_data[yum_key]
                continue
            yum_comparable_rpm_string = self._rpm_info_str_to_yum_info_str(
                combined_data[rpm_key])
            if yum_comparable_rpm_string == yum_data[yum_key]:
                continue
            if rpm_key in fields_known_to_differ:
                continue
            raise Exception(
                'RPM data in "{}" not match in rpm "{}" vs yum "{}" for package {}'.format(
                    rpm_key,
                    repr(combined_data[rpm_key]),
                    repr(yum_data[yum_key]),
                    combined_data))
        return combined_data

    @staticmethod
    def _rpm_info_str_to_yum_info_str(string):
        """Replace every run of non-ASCII characters with a single '?'.

        Pure ASCII input is returned unchanged. The substitution is done
        directly instead of probing with ``str.decode()``, which does not
        exist for text strings on Python 3; the results on Python 2 are the
        same as with the probe.

        :raises Exception: re-raised after logging when ``string`` is not a
            text or byte string
        """
        if isinstance(string, bytes):
            # Byte strings need a byte pattern and replacement on Python 3.
            pattern, replacement = br'[^\x00-\x7F]+', b'?'
        else:
            pattern, replacement = r'[^\x00-\x7F]+', '?'
        try:
            return re.sub(pattern, replacement, string)
        except Exception as e:
            logging.error('{}: for string {}'.format(str(e), repr(string)))
            raise

    @staticmethod
    def _validate_rpm_lists_identical(yum_rpms, rpm_rpms):
        """Verify that both parsers produced the same set of packages.

        :raises Exception: when the counts differ, the package names differ
            or ``is_same_package_as`` reports a mismatch for a name
        """
        yum_rpms_dict = {rpm['Name']: rpm for rpm in yum_rpms}
        rpm_rpms_dict = {rpm['Name']: rpm for rpm in rpm_rpms}
        if len(yum_rpms) != len(rpm_rpms):
            raise Exception(
                'Given RPM lists are unequal: yum RPM count {} != rpm RPM count {}'.format(
                    len(yum_rpms), len(rpm_rpms)))
        # Explicit check instead of ``assert``: asserts are removed under -O
        # and would not tell which names are the problem.
        yum_names = set(yum_rpms_dict)
        rpm_names = set(rpm_rpms_dict)
        if yum_names != rpm_names:
            raise Exception(
                'Given RPM lists contain different packages: '
                'only in yum {}, only in rpm {}'.format(
                    sorted(yum_names - rpm_names), sorted(rpm_names - yum_names)))
        for name in yum_rpms_dict:
            if not yum_rpms_dict[name].is_same_package_as(rpm_rpms_dict[name]):
                raise Exception(
                    'Packages are not same: yum {} != rpm {}'.format(yum_rpms_dict[name],
                                                                     rpm_rpms_dict[name]))

    def _read_configured_repos(self):
        """Return the configured repositories plus the localrepo entry.

        The remote localrepo is used when ``BUILD_URL`` is set in the
        environment.
        """
        # NOTE(review): this checks BUILD_URL while _read_available_rpms uses
        # self.remote for the same choice - confirm that this is intended.
        repos = self.repoconfig.read_sections(
            ['baseimage-repositories', 'repositories'])
        repos.append(self.repoconfig.get_localrepo(remote='BUILD_URL' in os.environ))
        logging.debug('Configured repos: {}'.format(pformat(repos)))
        return repos

    def _read_available_rpms(self, repos):
        """Query yum for all packages of ``repos`` and group them by repo.

        :return: dict mapping the ``Repo`` value of the parsed packages to
            the list of packages; repos without any package have no key
        """
        Yum.clean_and_remove_cache()
        yum = Yum()
        for repo in repos:
            name = repo['name']
            if name != 'localrepo':
                yum.add_repo(name, repo['baseurl'])
                continue
            remote = bool(self.remote)
            url = self.repoconfig.get_localrepo(remote=remote)['baseurl']
            yum.add_repo(name, apply_jenkins_auth(url) if remote else url)
        yum_available_output = yum.read_all_packages()
        available_rpms = YumInfoParser().parse_available(yum_available_output)
        rpms_per_repo = {}
        for rpm in available_rpms:
            rpms_per_repo.setdefault(rpm.get('Repo'), []).append(rpm)
        return rpms_per_repo

    def _log_repo_rpm_statistics(self):
        """Log the installed RPM count per repo and warn about unused repos.

        :return: the installed RPM list (unchanged)
        """
        logging.info('--- RPM repo statistics ---')
        # Loop invariant, so resolved once.
        nonerepo_names = [r['name'] for r in self._get_nonerepos()]
        for repo in self.repos:
            name = repo['name']
            repo_url = repo['baseurl']
            # RPMs resolved from "none" repos carry no 'From repo' value.
            expected_from_repo = None if name in nonerepo_names else name
            repo_installed_rpm_count = len([rpm for rpm in self.installed_rpms if
                                            rpm['Repo data']['baseurl'] == repo_url and rpm.get(
                                                'From repo') == expected_from_repo])
            logging.info(
                'RPMs installed from repo "{}": {}'.format(name, repo_installed_rpm_count))
            # Value comparison; ``is 0`` relied on an interpreter detail.
            if repo_installed_rpm_count == 0:
                logging.warning(
                    'Repository configured but no RPMs installed: {}={}'.format(name, repo_url))

        return self.installed_rpms

    def _log_rpm_statistics(self):
        """Log the total RPM count and the counts per attribute."""
        def _get_count(func):
            return len([rpm for rpm in self.installed_rpms if func(rpm)])

        logging.info('----- RPMs per type -----')
        logging.info(' => Total: %s', len(self.installed_rpms))
        logging.info('----- RPMs per attribute -----')
        logging.info(' * Crypto capable: %s', _get_count(lambda rpm: rpm['Crypto capable']))
        logging.info(' * Complex (BOM): %s', _get_count(lambda rpm: rpm['BOM']))

    def _get_rpm_available_in(self, rpm, available_rpms):
        """Return the name of the repo in which the installed RPM is available.

        With a ``From repo`` value the RPM must be found in exactly that repo
        ('localrepo' is accepted without lookup); without it the "none"
        repos are searched in configuration order.

        :param rpm: installed RPM record
        :param available_rpms: dict of repo name -> list of available RPMs
        :raises Exception: when the RPM is not available in the expected repos
        """
        if 'From repo' in rpm:
            if rpm['From repo'] == 'localrepo':
                return 'localrepo'
            # A repo without any available package has no entry; treat it as
            # empty so that the descriptive error below is raised.
            available_repo_rpms = available_rpms.get(rpm['From repo'], [])
            for a_rpm in available_repo_rpms:
                if self._is_same_rpm(a_rpm, rpm):
                    return rpm['From repo']
            rpms_in_matching_repo = [str(a_rpm) for a_rpm in available_repo_rpms]
            rpms_with_matching_name = [str(a_rpm) for a_rpm in available_repo_rpms if
                                       rpm['Name'] == a_rpm['Name']]
            if len(rpms_in_matching_repo) <= 1000:
                logging.debug(
                    'Available RPMs in {}: {}'.format(rpm['From repo'], rpms_in_matching_repo))
            error_str = 'RPM "{}" is not available in configured repo: {}, ' \
                        'RPMs with correct name: {}'.format(str(rpm), rpm['From repo'],
                                                            rpms_with_matching_name)
            raise Exception(error_str)

        none_repos = self._get_nonerepos()
        for repo in [r['name'] for r in none_repos]:
            # An empty "none" repo must not stop the search in the others.
            for a_rpm in available_rpms.get(repo, []):
                if self._is_same_rpm(a_rpm, rpm):
                    return repo
        msg = 'RPM "{}" is not available in any configured "none*" repos: {}'.format(
            rpm['Name'], none_repos)
        raise Exception(msg)

    def _get_nonerepos(self):
        """Return the configured repos whose name is 'none' plus digits."""
        return [repo for repo in self.repos if re.match(r'^none\d+$', repo['name'])]

    @staticmethod
    def _is_same_rpm(rpm1, rpm2):
        """Compare name, version, release and architecture.

        ``rpm1`` is read through its ``Arch`` key and ``rpm2`` through its
        ``Architecture`` key.
        """
        return rpm1['Name'] == rpm2['Name'] and \
               rpm1['Version'] == rpm2['Version'] and \
               rpm1['Release'] == rpm2['Release'] and \
               rpm1['Arch'] == rpm2['Architecture']
292
293
def parse(args):
    """Parse the command line arguments of the RPM data generator.

    :param args: list of argument strings (without the program name)
    :return: ``argparse.Namespace`` holding the parsed options
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Generate package info')
    # (flags, keyword arguments) in the order they appear in the help text.
    option_table = [
        (('--verbose', '-v'), {'action': 'store_true',
                               'help': 'More verbose logging'}),
        (('--yum-info-path',), {'required': True,
                                'help': '"yum info all" output as file'}),
        (('--rpm-info-path',), {'required': True,
                                'help': '"rpm -qai" output as file'}),
        (('--crypto-info-path',), {'help': 'Dir from where to find ECC file'}),
        (('--boms-path',), {'help': 'Dir from where to find RPM bill of material files'}),
        (('--output-rpmlist',), {'help': 'output as rpm list like "rpm-qa"'}),
        (('--output-json',), {'help': 'output json file path'}),
        (('--output-csv',), {'help': 'output csv file path'}),
        (('--output-ms-csv',), {'help': 'output Microsoft Excel compatible csv file path'}),
        (('--build-config-path',), {'required': True,
                                    'help': 'Build configuration ini path'}),
        (('--remote',), {'action': 'store_true',
                         'help': 'Read localrepo from remote defined by BUILD_URL, '
                                 'otherwise use localrepo from WORKSPACE'}),
    ]
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser.parse_args(args)
323
324
def read_files(boms_dir):
    """Load every entry of ``boms_dir`` with ``read_json``.

    :param boms_dir: directory whose entries are all read as JSON
    :return: dict mapping each entry name to its decoded content
    """
    return {entry: read_json(boms_dir + '/' + entry)
            for entry in os.listdir(boms_dir)}
330
331
def main(input_args):
    """Build the RPM data and write it in the requested output formats.

    ``--crypto-info-path`` and ``--boms-path`` are optional on the command
    line; when one is omitted an empty crypto list or an empty BOM mapping is
    used instead of passing ``None`` to the file readers.

    :param input_args: list of command line argument strings
    """
    args = parse(input_args)
    if args.verbose:
        set_logging(debug=True, timestamps=True)
    else:
        set_logging(debug=False)
    # RpmDataBuilder decodes the crypto info as JSON and iterates it, so an
    # empty JSON list means "no crypto capable packages known".
    crypto_info = read_from(args.crypto_info_path) if args.crypto_info_path else '[]'
    boms = read_files(args.boms_path) if args.boms_path else {}
    rpmdata = RpmDataBuilder(args.build_config_path,
                             read_from(args.yum_info_path),
                             read_from(args.rpm_info_path),
                             crypto_info,
                             boms,
                             remote=args.remote).run()
    if args.output_rpmlist:
        write_to(args.output_rpmlist, '\n'.join(sorted([str(rpm) for rpm in rpmdata])))
    if args.output_json:
        write_to(args.output_json, to_json(rpmdata))
    csv = CsvConverter(rpmdata, preferred_field_order=['Name', 'Version', 'Release',
                                                       'License', 'Vendor', 'From repo',
                                                       'Source RPM'])
    if args.output_csv:
        write_to(args.output_csv, str(csv))
    if args.output_ms_csv:
        write_to(args.output_ms_csv,
                 csv.convert_to_ms_excel(text_fields=['Version', 'Size', 'Release']))
    # Fall back to stdout when neither a JSON nor a CSV file was requested.
    if not args.output_json and not args.output_csv:
        print(rpmdata)
358
359
# Script entry point: hand the command line (without the program name) to main().
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)