Fix an issue related to searching anaconda repo
[ta/build-tools.git] / tools / script / create_rpm_data.py
1 #!/usr/bin/env python
2 # Copyright 2019 Nokia
3 #
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7 #
8 #     http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16 # pylint: disable=too-many-instance-attributes,too-many-arguments
17
18 import argparse
19 import copy
20 import sys
21 import logging
22 import re
23 import json
24 from pprint import pformat
25
26 import os
27
28 from tools.rpm import RpmInfoParser
29 from tools.utils import apply_jenkins_auth
30 from tools.yum import Yum, YumInfoParser
31 from tools.repository import RepositoryConfig
32 from tools.log import set_logging
33 from tools.io import read_from, write_to, read_json
34 from tools.convert import to_json, CsvConverter
35
# 'From repo' values that RpmDataBuilder._get_rpm_available_in() resolves
# directly to the 'localrepo' repository, without searching the listings of
# available RPMs.
LOCAL_REPOS = ['localrepo', 'anaconda']
37
38
class RpmDataBuilder(object):
    """Build per-RPM records for the installed packages.

    ``run()`` parses the "installed" listings produced by yum and rpm, merges
    them, and annotates every RPM with the repository it is available in
    ('Repo data'), its 'Obsoletes' value, a 'Crypto capable' flag and its
    'BOM'.
    """

    def __init__(self, build_config, yum_info_installed, rpm_info_installed,
                 crypto_info_installed, boms, remote=False):
        """Store the inputs; the yum/rpm listings are not parsed until run().

        :param build_config: handed unchanged to RepositoryConfig.
        :param yum_info_installed: text given to YumInfoParser.parse_installed().
        :param rpm_info_installed: text given to RpmInfoParser.parse_multiple().
        :param crypto_info_installed: JSON text, decoded here. The decoded
            value is iterated and each item's 'name' is compared to str(rpm).
        :param boms: mapping looked up with str(rpm); a found value must
            contain a 'bom' key.
        :param remote: when true, the localrepo listing is read from the
            remote URL (with Jenkins auth applied) instead of the local one.
        """
        self.remote = remote
        self.yum_info_installed = yum_info_installed
        self.rpm_info_installed = rpm_info_installed
        self.crypto_info_installed = json.loads(crypto_info_installed)
        self.boms = boms
        logging.debug('BOMS: {}'.format(pformat(self.boms)))
        self.repoconfig = RepositoryConfig(build_config)
        # Both are populated by run().
        self.installed_rpms = None
        self.repos = None

    def run(self):
        """Collect and annotate the installed RPMs.

        :return: list of installed RPM records, each extended with the keys
            'Repo data', 'Obsoletes', 'Crypto capable' and 'BOM'.
        :raises Exception: when an installed RPM cannot be located in the
            configured repositories or a BOM is malformed.
        """
        self.installed_rpms = self.read_installed_rpms()
        srpms = set([rpm['Source RPM'] for rpm in self.installed_rpms])
        logging.info('Installed RPMs:{} SRPMs:{}'.format(len(self.installed_rpms), len(srpms)))
        self.repos = self._read_configured_repos()
        logging.info('Configured repos: {}'.format(len(self.repos)))
        available_rpms = self._read_available_rpms(self.repos)
        logging.info('Found {} available RPMs in binary repos'.format(
            sum(len(repo_rpms) for repo_rpms in available_rpms.values())))
        for i_rpm in self.installed_rpms:
            i_rpm_repo_name = self._get_rpm_available_in(i_rpm, available_rpms)
            i_rpm['Repo data'] = self._get_repo(i_rpm_repo_name)
            i_rpm['Obsoletes'] = self._resolve_obsoletes(i_rpm)
            i_rpm['Crypto capable'] = self._resolve_ecc(str(i_rpm))
            i_rpm['BOM'] = self._resolve_bom(i_rpm)
        self._log_repo_rpm_statistics()
        self._log_rpm_statistics()
        return self.installed_rpms

    @staticmethod
    def _resolve_obsoletes(rpm):
        """Return the RPM's 'Obsoletes' value, or 'N/A' if absent or '(none)'."""
        if 'Obsoletes' not in rpm or rpm['Obsoletes'] == '(none)':
            return 'N/A'
        return rpm['Obsoletes']

    def _resolve_ecc(self, rpm):
        """Return True when an item of the crypto info has 'name' equal to rpm.

        :param rpm: the string form of the RPM (run() passes str(i_rpm)).
        """
        return any(item['name'] == rpm for item in self.crypto_info_installed)

    def _resolve_bom(self, rpm):
        """Return the validated 'bom' list for rpm, or '' when it has no BOM.

        :raises Exception: when the BOM exists but fails _validate_bom().
        """
        bom_content = self.boms.get(str(rpm))
        if bom_content is None:
            return ''
        self._validate_bom(str(rpm), bom_content)
        return bom_content['bom']

    @staticmethod
    def _validate_bom(rpm_name, bom_content):
        """Check that bom_content follows the expected BOM layout.

        Every material under 'bom' must have the keys 'name', 'version',
        'source-url' and 'foss', with 'foss' being yes/no/modified (case
        insensitive). A missing 'crypto-capable' key only triggers a warning.

        :raises Exception: on any violation; the message names the RPM, the
            underlying error and shows the expected format.
        """
        try:
            if 'bom' not in bom_content:
                raise Exception('BOM base object "bom" missing')
            bom = bom_content['bom']
            for material in bom:
                for key in ['name', 'version', 'source-url', 'foss']:
                    if key not in material:
                        raise Exception('Key "{}" not found in BOM'.format(key))
                if material['foss'].lower() not in ['yes', 'no', 'modified']:
                    raise Exception('BOM foss value not valid')
            missing_crypto_count = len([material for material in bom if
                                        'crypto-capable' not in material])
            if missing_crypto_count != 0:
                logging.warning(
                    'crypto-capable missing from %s materials in RPM %s',
                    missing_crypto_count, rpm_name)
        except Exception as e:
            # Any failure above (including unexpected value types) is reported
            # uniformly together with a template of the correct format.
            correct_format = {'bom': [
                {'name': '<component-name>',
                 'version': '<component-version>',
                 'source-url': '<source-url>',
                 'foss': '<yes/no/modified>',
                 'crypto-capable': '<true/false (OPTIONAL)>'}]}
            msg_fmt = 'BOM for {rpm} is not correct format. {error}:\n{correct_format}'
            raise Exception(msg_fmt.format(rpm=rpm_name,
                                           error=str(e),
                                           correct_format=pformat(correct_format)))

    def _get_repo(self, name):
        """Return the configured repo whose 'name' equals name.

        :raises Exception: when no such repo is configured.
        """
        for r in self.repos:
            if r['name'] == name:
                return r
        raise Exception('No repository found with name: {}'.format(name))

    def read_installed_rpms(self):
        """Parse the yum and rpm listings and merge them per package name.

        :return: list with one combined record per RPM reported by rpm.
        :raises Exception: when the two listings do not describe the same
            packages or a shared field has conflicting values.
        """
        installed_rpms = []
        yum_rpms = YumInfoParser().parse_installed(self.yum_info_installed)
        rpm_rpms = RpmInfoParser().parse_multiple(self.rpm_info_installed)
        self._validate_rpm_lists_identical(yum_rpms, rpm_rpms)
        yum_rpms_dict = {rpm['Name']: rpm for rpm in yum_rpms}
        for rpm_data in rpm_rpms:
            yum_data = yum_rpms_dict[rpm_data['Name']]
            combined_data = self._combine_rpm_data(rpm_data, yum_data)
            installed_rpms.append(combined_data)
        # Guarded: with no installed RPMs there is no example to show, and
        # indexing [0] would raise IndexError.
        if installed_rpms:
            logging.debug(
                'One parsed RPM data as example:\n{}'.format(pformat(installed_rpms[0])))
        return installed_rpms

    def _combine_rpm_data(self, rpm_data, yum_data):
        """Return a deep copy of rpm_data extended with the fields of yum_data.

        Fields present in both must match (after the rpm value has been made
        yum-comparable), except for the fields known to differ, where the rpm
        value is kept. The yum key 'Arch' is treated as rpm's 'Architecture'.

        :raises Exception: when a shared field differs unexpectedly.
        """
        combined_data = copy.deepcopy(rpm_data)
        fields_known_to_differ = ['Description',  # May contain differing newline and indentation
                                  'Size']  # Bytes in RPM, humanreadable in yum
        yum2rpm_field_name_map = {'Arch': 'Architecture'}
        for yum_key in yum_data:
            rpm_key = yum2rpm_field_name_map.get(yum_key, yum_key)
            if rpm_key not in combined_data:
                # Field only known to yum: take it over as-is.
                combined_data[rpm_key] = yum_data[yum_key]
                continue
            yum_comparable_rpm_string = self._rpm_info_str_to_yum_info_str(
                combined_data[rpm_key])
            if yum_comparable_rpm_string == yum_data[yum_key]:
                continue
            if rpm_key in fields_known_to_differ:
                continue
            raise Exception(
                'RPM data in "{}" not match in rpm "{}" vs yum "{}" for package {}'.format(
                    rpm_key,
                    repr(combined_data[rpm_key]),
                    repr(yum_data[yum_key]),
                    combined_data))
        return combined_data

    @staticmethod
    def _rpm_info_str_to_yum_info_str(string):
        """Return string with each run of non-ASCII characters replaced by '?'.

        Pure-ASCII strings come back unchanged. Used to make an rpm field
        comparable with the corresponding yum field (presumably yum prints
        non-ASCII text as '?' -- confirm against real yum output).

        The substitution is applied unconditionally instead of first probing
        with ``string.decode()``: that method only exists on Python 2 ``str``,
        so the probe raised AttributeError for every value on Python 3. The
        result is the same, since the substitution leaves ASCII text alone.

        :raises Exception: re-raises whatever the substitution raises (for
            example TypeError for a non-string value) after logging it.
        """
        try:
            return re.sub(r'[^\x00-\x7F]+', '?', string)
        except Exception as e:
            logging.error('{}: for string {}'.format(str(e), repr(string)))
            raise

    @staticmethod
    def _validate_rpm_lists_identical(yum_rpms, rpm_rpms):
        """Verify that yum and rpm report exactly the same packages.

        :raises Exception: when the list lengths differ or two same-named
            entries are not the same package per is_same_package_as().
        :raises AssertionError: when the sets of package names differ.
        """
        yum_rpms_dict = {rpm['Name']: rpm for rpm in yum_rpms}
        rpm_rpms_dict = {rpm['Name']: rpm for rpm in rpm_rpms}
        if len(yum_rpms) != len(rpm_rpms):
            raise Exception(
                'Given RPM lists are unequal: yum RPM count {} != rpm RPM count {}'.format(
                    len(yum_rpms), len(rpm_rpms)))
        if sorted(yum_rpms_dict.keys()) != sorted(rpm_rpms_dict.keys()):
            # Raised explicitly (rather than via an assert statement) so the
            # check also runs under "python -O"; the exception type is kept.
            raise AssertionError(
                'RPM names differ: only in yum {}, only in rpm {}'.format(
                    sorted(set(yum_rpms_dict) - set(rpm_rpms_dict)),
                    sorted(set(rpm_rpms_dict) - set(yum_rpms_dict))))
        for name in yum_rpms_dict.keys():
            if not yum_rpms_dict[name].is_same_package_as(rpm_rpms_dict[name]):
                raise Exception(
                    'Packages are not same: yum {} != rpm {}'.format(yum_rpms_dict[name],
                                                                     rpm_rpms_dict[name]))

    def _read_configured_repos(self):
        """Return the repos of the build config plus the localrepo entry.

        The remote localrepo is used when BUILD_URL is set in the environment.
        NOTE(review): _read_available_rpms() decides on self.remote instead;
        confirm whether the two are meant to differ.
        """
        repos = self.repoconfig.read_sections(
            ['baseimage-repositories', 'repositories'])
        if 'BUILD_URL' in os.environ:
            repos.append(self.repoconfig.get_localrepo(remote=True))
        else:
            repos.append(self.repoconfig.get_localrepo(remote=False))
        logging.debug('Configured repos: {}'.format(pformat(repos)))
        return repos

    def _read_available_rpms(self, repos):
        """List the packages available in repos, grouped by repository.

        :param repos: repo entries providing 'name' and 'baseurl'.
        :return: dict mapping each package's 'Repo' value to the list of
            packages reported for it. Repos without any reported package have
            no key in the result.
        """
        Yum.clean_and_remove_cache()
        yum = Yum()
        for repo in repos:
            name = repo['name']
            if name == 'localrepo':
                # The localrepo URL is re-read according to self.remote rather
                # than taken from the passed-in repo entry.
                if self.remote:
                    url = self.repoconfig.get_localrepo(remote=True)['baseurl']
                    yum.add_repo(name, apply_jenkins_auth(url))
                else:
                    url = self.repoconfig.get_localrepo(remote=False)['baseurl']
                    yum.add_repo(name, url)
            else:
                yum.add_repo(name, repo['baseurl'])
        yum_available_output = yum.read_all_packages()
        available_rpms = YumInfoParser().parse_available(yum_available_output)
        rpms_per_repo = {}
        for rpm in available_rpms:
            rpms_per_repo.setdefault(rpm.get('Repo'), []).append(rpm)
        return rpms_per_repo

    def _log_repo_rpm_statistics(self):
        """Log how many installed RPMs come from each configured repo.

        An RPM is counted for a repo when its 'Repo data' has that repo's
        baseurl and its 'From repo' equals the repo name (or is absent, for
        "none*" repos). A warning is logged for repos without installed RPMs.

        :return: self.installed_rpms, unchanged.
        """
        logging.info('--- RPM repo statistics ---')
        # Depends only on self.repos, so it is computed once for all repos.
        none_repo_names = [r['name'] for r in self._get_nonerepos()]
        for repo in self.repos:
            name = repo['name']
            repo_url = repo['baseurl']
            expected_from_repo = None if name in none_repo_names else name
            repo_installed_rpm_count = len([rpm for rpm in self.installed_rpms if
                                            rpm['Repo data']['baseurl'] == repo_url and rpm.get(
                                                'From repo') == expected_from_repo])
            logging.info(
                'RPMs installed from repo "{}": {}'.format(name, repo_installed_rpm_count))
            # Value comparison; "is 0" relied on an interpreter detail.
            if repo_installed_rpm_count == 0:
                logging.warning(
                    'Repository configured but no RPMs installed: {}={}'.format(name, repo_url))

        return self.installed_rpms

    def _log_rpm_statistics(self):
        """Log the total RPM count and the counts of crypto-capable/BOM RPMs."""
        def _get_count(func):
            return len([rpm for rpm in self.installed_rpms if func(rpm)])

        logging.info('----- RPMs per type -----')
        logging.info(' => Total: %s', len(self.installed_rpms))
        logging.info('----- RPMs per attribute -----')
        logging.info(' * Crypto capable: %s', _get_count(lambda rpm: rpm['Crypto capable']))
        logging.info(' * Complex (BOM): %s', _get_count(lambda rpm: rpm['BOM']))

    def _get_rpm_available_in(self, rpm, available_rpms):
        """Return the name of the repository that provides the installed rpm.

        With a 'From repo' value: names listed in LOCAL_REPOS resolve to
        'localrepo'; otherwise the RPM must be found in that repo's listing.
        Without one: the "none*" repos are searched in configuration order.

        A repo that has no entry in available_rpms is treated as empty, so the
        search continues (or ends in the descriptive error below) instead of
        failing with a bare KeyError.

        :param rpm: installed RPM record.
        :param available_rpms: dict of repo name -> available packages.
        :raises Exception: when the RPM is not found where expected.
        """
        if 'From repo' in rpm.keys():
            if rpm['From repo'] in LOCAL_REPOS:
                return 'localrepo'
            available_repo_rpms = available_rpms.get(rpm['From repo'], [])
            for a_rpm in available_repo_rpms:
                if self._is_same_rpm(a_rpm, rpm):
                    return rpm['From repo']
            rpms_in_matching_repo = [str(a_rpm) for a_rpm in available_repo_rpms]
            rpms_with_matching_name = [str(a_rpm) for a_rpm in available_repo_rpms if
                                       rpm['Name'] == a_rpm['Name']]
            # Only dump the repo content when it is small enough to be readable.
            if len(rpms_in_matching_repo) <= 1000:
                logging.debug(
                    'Available RPMs in {}: {}'.format(rpm['From repo'], rpms_in_matching_repo))
            error_str = 'RPM "{}" is not available in configured repo: {}, ' \
                        'RPMs with correct name: {}'.format(str(rpm), rpm['From repo'],
                                                            rpms_with_matching_name)
            raise Exception(error_str)
        else:
            none_repos = self._get_nonerepos()
            for repo in [r['name'] for r in none_repos]:
                for a_rpm in available_rpms.get(repo, []):
                    if self._is_same_rpm(a_rpm, rpm):
                        return repo
            msg = 'RPM "{}" is not available in any configured "none*" repos: {}'.format(
                rpm['Name'], none_repos)
            raise Exception(msg)

    def _get_nonerepos(self):
        """Return the configured repos whose name is 'none' followed by digits."""
        return [repo for repo in self.repos if re.match(r'^none\d+$', repo['name'])]

    @staticmethod
    def _is_same_rpm(rpm1, rpm2):
        """Compare an available package (rpm1) with an installed one (rpm2).

        Name, Version and Release must match, and rpm1's 'Arch' must equal
        rpm2's 'Architecture'.
        """
        return rpm1['Name'] == rpm2['Name'] and \
               rpm1['Version'] == rpm2['Version'] and \
               rpm1['Release'] == rpm2['Release'] and \
               rpm1['Arch'] == rpm2['Architecture']
294
295
def parse(args):
    """Parse the command-line arguments of the RPM data generator.

    :param args: list of argument strings (without the program name).
    :return: argparse.Namespace holding one attribute per option.
    """
    # (flags, add_argument keyword options), in the order they are registered.
    option_specs = (
        (('--verbose', '-v'), {'action': 'store_true',
                               'help': 'More verbose logging'}),
        (('--yum-info-path',), {'required': True,
                                'help': '"yum info all" output as file'}),
        (('--rpm-info-path',), {'required': True,
                                'help': '"rpm -qai" output as file'}),
        (('--crypto-info-path',), {'help': 'Dir from where to find ECC file'}),
        (('--boms-path',), {'help': 'Dir from where to find RPM bill of material files'}),
        (('--output-rpmlist',), {'help': 'output as rpm list like "rpm-qa"'}),
        (('--output-json',), {'help': 'output json file path'}),
        (('--output-csv',), {'help': 'output csv file path'}),
        (('--output-ms-csv',), {'help': 'output Microsoft Excel compatible csv file path'}),
        (('--build-config-path',), {'required': True,
                                    'help': 'Build configuration ini path'}),
        (('--remote',), {'action': 'store_true',
                         'help': 'Read localrepo from remote defined by BUILD_URL, '
                                 'otherwise use localrepo from WORKSPACE'}),
    )
    parser = argparse.ArgumentParser(
        description='Generate package info',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args(args)
325
326
def read_files(boms_dir):
    """Load every file in boms_dir as JSON.

    :param boms_dir: directory containing the BOM files, or None when no BOM
        directory was given (the --boms-path option is optional).
    :return: dict mapping each file name in boms_dir to its parsed JSON
        content; empty when boms_dir is None.
    """
    if boms_dir is None:
        # Without this guard os.listdir(None) either fails or lists the
        # current directory, depending on the Python version.
        return {}
    return {name: read_json(os.path.join(boms_dir, name))
            for name in os.listdir(boms_dir)}
332
333
def main(input_args):
    """Generate the RPM data and write it in the requested output formats.

    :param input_args: command-line arguments without the program name.

    The collected data is written to each output path that was given. When
    neither --output-json nor --output-csv is given it is printed to stdout.
    """
    args = parse(input_args)
    if args.verbose:
        set_logging(debug=True, timestamps=True)
    else:
        set_logging(debug=False)
    # --crypto-info-path and --boms-path are optional: fall back to an empty
    # JSON list / no BOMs instead of failing on a None path.
    if args.crypto_info_path:
        crypto_info = read_from(args.crypto_info_path)
    else:
        crypto_info = '[]'
    boms = read_files(args.boms_path) if args.boms_path else {}
    rpmdata = RpmDataBuilder(args.build_config_path,
                             read_from(args.yum_info_path),
                             read_from(args.rpm_info_path),
                             crypto_info,
                             boms,
                             remote=args.remote).run()
    if args.output_rpmlist:
        write_to(args.output_rpmlist, '\n'.join(sorted([str(rpm) for rpm in rpmdata])))
    if args.output_json:
        write_to(args.output_json, to_json(rpmdata))
    csv = CsvConverter(rpmdata, preferred_field_order=['Name', 'Version', 'Release',
                                                       'License', 'Vendor', 'From repo',
                                                       'Source RPM'])
    if args.output_csv:
        write_to(args.output_csv, str(csv))
    if args.output_ms_csv:
        write_to(args.output_ms_csv,
                 csv.convert_to_ms_excel(text_fields=['Version', 'Size', 'Release']))
    if not args.output_json and not args.output_csv:
        print(rpmdata)
360
361
# Script entry point: pass the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])