From c3114534fcc25ecaf1cfcd6de80acf47868ad44b Mon Sep 17 00:00:00 2001
From: TannedCung
Date: Mon, 17 Jun 2024 19:05:20 +0700
Subject: [PATCH 1/4] Fix: misleading avg, update accuracy formula

---
 cope2n-api/fwd_api/utils/accuracy.py | 43 +++++++++++++++++++---------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/cope2n-api/fwd_api/utils/accuracy.py b/cope2n-api/fwd_api/utils/accuracy.py
index ef88970..8e23fbd 100755
--- a/cope2n-api/fwd_api/utils/accuracy.py
+++ b/cope2n-api/fwd_api/utils/accuracy.py
@@ -18,8 +18,10 @@ from ..models import SubscriptionRequest, Report, ReportFile
 import json
 from typing import Union, List, Dict
 
-valid_keys = ["retailername", "sold_to_party", "invoice_no", "purchase_date", "imei_number"]
-optional_keys = ['invoice_no']
+VALID_KEYS = ["retailername", "sold_to_party", "invoice_no", "purchase_date", "imei_number"]
+KEYS_BY_FILE_TYPE = {"imei": ["imei_number"],
+                     "invoice": ["retailername", "invoice_no", "purchase_date"]}
+OPTIONAL_KEYS = ['invoice_no']
 
 class ReportAccumulateByRequest:
     def __init__(self, sub):
@@ -123,6 +125,7 @@ class ReportAccumulateByRequest:
             "review_progress": []
         },
         self.report = copy.deepcopy(self.month_format)
+        self.report["average_accuracy_rate"]["avg"] = IterAvg()
 
     @staticmethod
     def update_total(total, report_file):
@@ -142,8 +145,10 @@ class ReportAccumulateByRequest:
         for key in settings.FIELD:
             if sum([len(report_file.reviewed_accuracy[x]) for x in report_file.reviewed_accuracy.keys() if "_count" not in x]) > 0 :
                 total["average_accuracy_rate"][key].add(report_file.reviewed_accuracy.get(key, []))
+                total["average_accuracy_rate"]['avg'].add(report_file.reviewed_accuracy.get(key, []))
             elif sum([len(report_file.feedback_accuracy[x]) for x in report_file.feedback_accuracy.keys() if "_count" not in x]) > 0:
                 total["average_accuracy_rate"][key].add(report_file.feedback_accuracy.get(key, []))
+                total["average_accuracy_rate"]['avg'].add(report_file.feedback_accuracy.get(key, []))
             total["feedback_accuracy"][key].add(report_file.feedback_accuracy.get(key, []))
             total["reviewed_accuracy"][key].add(report_file.reviewed_accuracy.get(key, []))
 
@@ -346,17 +351,17 @@ class ReportAccumulateByRequest:
         for key in _report["average_processing_time"].keys():
             _report["average_processing_time"][key] = _report["average_processing_time"][key]()
 
-        avg_acc = 0
-        count_acc = 0
+        # avg_acc = 0
+        # count_acc = 0
         for key in settings.FIELD:
             _report["average_accuracy_rate"][key] = _report["average_accuracy_rate"][key]()
             for accuracy_type in ["feedback_accuracy", "reviewed_accuracy"]:
-                if (_report[accuracy_type][key].count + count_acc) > 0:
-                    avg_acc = (avg_acc*count_acc + _report[accuracy_type][key].avg*_report[accuracy_type][key].count) / (_report[accuracy_type][key].count + count_acc)
-                    count_acc += _report[accuracy_type][key].count
+                # if (_report[accuracy_type][key].count + count_acc) > 0:
+                #     avg_acc = (avg_acc*count_acc + _report[accuracy_type][key].avg*_report[accuracy_type][key].count) / (_report[accuracy_type][key].count + count_acc)
+                #     count_acc += _report[accuracy_type][key].count
                 _report[accuracy_type][key] = _report[accuracy_type][key]()
 
-        _report["average_accuracy_rate"]["avg"] = avg_acc
+        _report["average_accuracy_rate"]["avg"] = _report["average_accuracy_rate"]["avg"]()
         _report["review_progress"] = _report["review_progress"].count(1)/(_report["review_progress"].count(0)+ _report["review_progress"].count(1)) if (_report["review_progress"].count(0)+ _report["review_progress"].count(1)) >0 else 0
_report["images_quality"]["successful_percent"] = _report["images_quality"]["successful"]/_report["total_images"] if _report["total_images"] > 0 else 0 @@ -734,9 +739,11 @@ def _accuracy_calculate_formatter(inference, target): Make both list inference and target to be the same length. """ if not isinstance(inference, list): - inference = [] if inference is None else [inference] + # inference = [] if inference is None else [inference] + inference = [inference] if not isinstance(target, list): - target = [] if target is None else [target] + # target = [] if target is None else [target] + target = [target] length = max(len(target), len(inference)) target = target + (length - len(target))*[target[0]] if len(target) > 0 else target + (length - len(target))*[None] @@ -745,7 +752,7 @@ def _accuracy_calculate_formatter(inference, target): return inference, target def _acc_will_be_ignored(key_name, _target): - is_optional_key = key_name in optional_keys + is_optional_key = key_name in OPTIONAL_KEYS is_empty_target = _target in [[], None, ''] if is_optional_key and is_empty_target: return True @@ -1043,7 +1050,15 @@ def calculate_subcription_file(subcription_request_file): feedback_result = copy.deepcopy(subcription_request_file.feedback_result) reviewed_result = copy.deepcopy(subcription_request_file.reviewed_result) - for key_name in valid_keys: + accuracy_keys_for_this_image = KEYS_BY_FILE_TYPE.get(subcription_request_file.doc_type, []) + + for key_name in VALID_KEYS: + att["acc"]["feedback"][key_name] = [] + att["normalized_data"]["feedback"][key_name] = [] + att["acc"]["reviewed"][key_name] = [] + att["normalized_data"]["reviewed"][key_name] = [] + + for key_name in accuracy_keys_for_this_image: try: att["acc"]["feedback"][key_name], att["normalized_data"]["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result, "feedback", sub=subcription_request_file.request.subsidiary) att["acc"]["reviewed"][key_name], att["normalized_data"]["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result, "reviewed", sub=subcription_request_file.request.subsidiary) @@ -1052,8 +1067,8 @@ def calculate_subcription_file(subcription_request_file): subcription_request_file.feedback_accuracy = att["acc"]["feedback"] subcription_request_file.reviewed_accuracy = att["acc"]["reviewed"] - avg_reviewed = calculate_avg_accuracy(att["acc"], "reviewed", valid_keys) - avg_feedback = calculate_avg_accuracy(att["acc"], "feedback", valid_keys) + avg_reviewed = calculate_avg_accuracy(att["acc"], "reviewed", VALID_KEYS) + avg_feedback = calculate_avg_accuracy(att["acc"], "feedback", VALID_KEYS) if avg_feedback is not None or avg_reviewed is not None: avg_acc = 0 From 0fb8d5457309475dc46a74380d67ca04141b15c2 Mon Sep 17 00:00:00 2001 From: TannedCung Date: Tue, 18 Jun 2024 10:55:10 +0700 Subject: [PATCH 2/4] Fix: purchase_date review --- cope2n-api/fwd_api/api/accuracy_view.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cope2n-api/fwd_api/api/accuracy_view.py b/cope2n-api/fwd_api/api/accuracy_view.py index 2d5cd85..f46af55 100755 --- a/cope2n-api/fwd_api/api/accuracy_view.py +++ b/cope2n-api/fwd_api/api/accuracy_view.py @@ -987,7 +987,7 @@ class AccuracyViewSet(viewsets.ViewSet): if isinstance(v, str) and v == '': v = None if k == "purchase_date" and v is not None: - v = v.split("") + v = v.split(",") if not isinstance(sample_result[k], list): sample_result[k] = v elif v: From 74431c3bc9e7166d593d60c192a2211a85d7ce5d Mon Sep 17 00:00:00 2001 From: 
TannedCung Date: Thu, 20 Jun 2024 15:02:10 +0700 Subject: [PATCH 3/4] Update: Accuracy fomula --- cope2n-api/fwd/settings.py | 7 ++ .../celery_worker/process_report_tasks.py | 12 ++-- ...se-populate-redemption-id-to-subsidiary.py | 65 +++++++++++++++++++ cope2n-api/fwd_api/utils/accuracy.py | 18 +++-- 4 files changed, 87 insertions(+), 15 deletions(-) create mode 100644 cope2n-api/fwd_api/management/commands/migrate-datebase-populate-redemption-id-to-subsidiary.py diff --git a/cope2n-api/fwd/settings.py b/cope2n-api/fwd/settings.py index 34f9655..48270c6 100755 --- a/cope2n-api/fwd/settings.py +++ b/cope2n-api/fwd/settings.py @@ -238,6 +238,13 @@ SUBS = { "SEAO": "seao" } +FIELDS_BY_SUB = { + "SG": {"imei": ["imei_number"], + "invoice": ["retailername", "purchase_date"]}, + "default": {"imei": ["imei_number"], + "invoice": ["retailername", "invoice_no", "purchase_date"]}, + } + BAD_THRESHOLD = 0.75 NEED_REVIEW = 1.0 diff --git a/cope2n-api/fwd_api/celery_worker/process_report_tasks.py b/cope2n-api/fwd_api/celery_worker/process_report_tasks.py index ea01557..9667d2f 100755 --- a/cope2n-api/fwd_api/celery_worker/process_report_tasks.py +++ b/cope2n-api/fwd_api/celery_worker/process_report_tasks.py @@ -126,11 +126,13 @@ def create_accuracy_report(report_id, **kwargs): "invoice_no": mean_list(request_att["acc"]["reviewed"].get("invoice_no", [None]))} rq_accuracy = [] - for rpf in _report_files: - if sum(len(value_list) for value_list in rpf.reviewed_accuracy.values()): - rq_accuracy += list(chain(*rpf.reviewed_accuracy.values())) - elif sum(len(value_list) for value_list in rpf.feedback_accuracy.values()): - rq_accuracy += list(chain(*rpf.feedback_accuracy.values())) + for i, _att in enumerate(_atts): + if _report_files[i].bad_image_reason in settings.ACC_EXCLUDE_RESEASONS: + continue + if sum(len(value_list) for value_list in _att["acc"]["reviewed"].values()): + rq_accuracy += list(chain(*_att["acc"]["reviewed"].values())) + elif sum(len(value_list) for value_list in _att["acc"]["feedback"].values()): + rq_accuracy += list(chain(*_att["acc"]["feedback"].values())) request.is_required = False if len(rq_accuracy) > 0: diff --git a/cope2n-api/fwd_api/management/commands/migrate-datebase-populate-redemption-id-to-subsidiary.py b/cope2n-api/fwd_api/management/commands/migrate-datebase-populate-redemption-id-to-subsidiary.py new file mode 100644 index 0000000..c40b4b1 --- /dev/null +++ b/cope2n-api/fwd_api/management/commands/migrate-datebase-populate-redemption-id-to-subsidiary.py @@ -0,0 +1,65 @@ +from django.core.management.base import BaseCommand +from tqdm import tqdm +from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest +from fwd_api.exception.exceptions import InvalidException +from fwd_api.utils.accuracy import predict_result_to_ready +import traceback +import copy +from django.utils import timezone + +KEY = "imei_number" +VALUE = "None" +EXPECTED_VALUE = [] + +class Command(BaseCommand): + help = 'Refactor database for image level' + + def add_arguments(self, parser): + # Add your command-line arguments here + parser.add_argument('start', type=str, help='start date, sample: 2023-01-02T00:00:00+0700') + parser.add_argument('end', type=str, help='end date, sample: 2023-01-03T00:00:00+0700') + + def process_request(self, request, result): + if len(request.request_id.split(".")[0].split("_")) < 2: + return + images = SubscriptionRequestFile.objects.filter(request=request) + if not request.predict_result: + # self.stdout.write(self.style.WARNING(f"Key predict_result not found 
in {request.request_id}")) + return + if request.predict_result.get("status", 200) != 200: + # self.stdout.write(self.style.WARNING(f"Not a sucess request {request.request_id}")) + return + + if isinstance(request.redemption_id, str) and request.subsidiary is not None: + try: + request.subsidiary = request.redemption_id[:2] + request.save() + result['total'] += 1 + result['subs'].add(request.redemption_id[:2]) + except Exception as e: + print(e) + result["failed"] += 1 + + def handle(self, *args, **options): + start = options['start'] + end = options['end'] + result = {'total':0, + 'failed':0, + 'subs': set()} + if start or end: + try: + start_date = timezone.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z') # We care only about day precision only + end_date = timezone.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z') + except Exception as e: + print(f"[INFO]: start: {start}") + print(f"[INFO]: end: {end}") + raise InvalidException(excArgs="Date format") + subcription_iter = SubscriptionRequest.objects.filter(created_at__range=(start_date, end_date)) + else: + subcription_iter = SubscriptionRequest.objects.all() + + # file = open("modified.txt", "w") + for request in tqdm(subcription_iter.iterator()): + self.process_request(request, result) + # file.close() + self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully! total/failed: {}/{} - subs: {}'.format(result['total'], result['failed'], result['subs']))) \ No newline at end of file diff --git a/cope2n-api/fwd_api/utils/accuracy.py b/cope2n-api/fwd_api/utils/accuracy.py index 8e23fbd..8329894 100755 --- a/cope2n-api/fwd_api/utils/accuracy.py +++ b/cope2n-api/fwd_api/utils/accuracy.py @@ -19,8 +19,6 @@ import json from typing import Union, List, Dict VALID_KEYS = ["retailername", "sold_to_party", "invoice_no", "purchase_date", "imei_number"] -KEYS_BY_FILE_TYPE = {"imei": ["imei_number"], - "invoice": ["retailername", "invoice_no", "purchase_date"]} OPTIONAL_KEYS = ['invoice_no'] class ReportAccumulateByRequest: @@ -776,7 +774,7 @@ def calculate_accuracy(key_name: str, inference: Dict[str, Union[str, List]], ta _inference = inference[key_name] _target = target[key_name] - _will_acc_be_ignored = _acc_will_be_ignored(key_name, _target) + # _will_acc_be_ignored = _acc_will_be_ignored(key_name, _target) _inference, _target = _accuracy_calculate_formatter(_inference, _target) for i, v in enumerate(_inference): @@ -793,8 +791,7 @@ def calculate_accuracy(key_name: str, inference: Dict[str, Union[str, List]], ta # "line_acc", # "one_minus_ned_word", ]) - if not _will_acc_be_ignored: - acc.append(list(score.values())[0]) + acc.append(list(score.values())[0]) data.append([x, y]) return acc, data @@ -908,12 +905,11 @@ def calculate_a_request(report, request): images = SubscriptionRequestFile.objects.filter(request=request, file_category=FileCategory.Origin.value) report_files = [] for image in images: - status, att = calculate_subcription_file(image) - atts.append(att) + status, att = calculate_subcription_file(image, request.subsidiary) att["acc"]["feedback"], fb_max_indexes = acc_maximize_list_values(att["acc"]["feedback"]) att["acc"]["reviewed"], rv_max_indexes = acc_maximize_list_values(att["acc"]["reviewed"]) - _att = copy.deepcopy(att) + _att = copy.deepcopy(att) # deep copy right here to advoid removing acc for bad images in the next steps fb_avg_acc = avg_dict(att["acc"]["feedback"]) rv_avg_acc = avg_dict(att["acc"]["reviewed"]) @@ -924,6 +920,8 @@ def calculate_a_request(report, request): continue if status != 
200: continue + + atts.append(att) image.feedback_accuracy = att["acc"]["feedback"] # dict {key: [values]} image.is_bad_image_quality = att["is_bad_image"] # is_bad_image=avg_acc Date: Fri, 21 Jun 2024 10:48:02 +0700 Subject: [PATCH 4/4] Add: refill S3 image by redemption script --- .../migrate-datebase-fillup-images.py | 174 ++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 cope2n-api/fwd_api/management/commands/migrate-datebase-fillup-images.py diff --git a/cope2n-api/fwd_api/management/commands/migrate-datebase-fillup-images.py b/cope2n-api/fwd_api/management/commands/migrate-datebase-fillup-images.py new file mode 100644 index 0000000..bdcf633 --- /dev/null +++ b/cope2n-api/fwd_api/management/commands/migrate-datebase-fillup-images.py @@ -0,0 +1,174 @@ +from django.core.management.base import BaseCommand +from tqdm import tqdm +from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest +from fwd_api.exception.exceptions import InvalidException +from fwd_api.utils.s3 import MinioS3Client + +import copy +import os +import glob +import traceback +import copy +import json +from django.utils import timezone + +IMAGE_DIRS = ["/external_data/SGGE", "/external_data/zipsGwp1", "/external_data/zipsGwp2", "/external_data/zipsGwp3", "/external_data/zipsGwp4", "/external_data/zipsEvoucher"] +# IMAGE_DIRS = ["/external_data/SGGE"] +image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.gif'] +pdf_extensions = ['*.pdf'] + +class Command(BaseCommand): + help = 'Refactor database for image level' + + def add_arguments(self, parser): + # Add your command-line arguments here + parser.add_argument('start', type=str, help='start date, sample: 2023-01-02T00:00:00+0700') + parser.add_argument('end', type=str, help='end date, sample: 2023-01-03T00:00:00+0700') + + def _prepare_data(self, redemtion_dirs): + prepared_data = {} # {"redemption_id": {"image_paths": []}, "pages": 1} + for redemtion_dir in redemtion_dirs: + redemptions = os.listdir(redemtion_dir) + for redemption in redemptions: + files_in_dir = [] + for ext in image_extensions + pdf_extensions: + files_in_dir.extend(glob.glob(os.path.join(redemtion_dir, redemption, ext))) + redemption = redemption.replace("Data", "") + if prepared_data.get(redemption, None): + prepared_data[redemption]["image_paths"] += files_in_dir + prepared_data[redemption]["pages"] += len(files_in_dir) + else: + prepared_data[redemption] = {"image_paths": files_in_dir, "pages": len(files_in_dir)} + + return prepared_data + + def _add_error(self, result, error, redemption_id): + if not result.get("Error", None): + result["Error"] = {} + if result["Error"].get(error, None): + result["Error"][error].add(redemption_id) + else: + result["Error"][error] = set([redemption_id]) + + def _add_info(self, result, info, redemption_id): + if not result.get("Info", None): + result["Info"] = {} + if result["Info"].get(info, None): + result["Info"][info].add(redemption_id) + else: + result["Info"][info] = set([redemption_id]) + + def _add_warning(self, result, warn, redemption_id): + if not result.get("Warning", None): + result["Warning"] = {} + if result["Warning"].get(warn, None): + result["Warning"][warn].add(redemption_id) + else: + result["Warning"][warn] = set([redemption_id]) + + def _try_find_doc_type(self, file_paths): + doc_types = {"invoice": [], + "imei": [], + "undefined": []} + for file_path in file_paths: + if "invoice" in os.path.basename(file_path): + doc_types["invoice"].append(file_path) + elif "imei" in os.path.basename(file_path): + 
doc_types["imei"].append(file_path) + else: + doc_types["undefined"].append(file_path) + return doc_types + + def process_request(self, request, data, result, s3_client): + if not request.predict_result: + # self.stdout.write(self.style.WARNING(f"Key predict_result not found in {request.request_id}")) + return + if request.predict_result.get("status", 200) != 200: + # self.stdout.write(self.style.WARNING(f"Not a sucess request {request.request_id}")) + return + # Find to coresponding redemption_ID + self._add_info(result, "[OCR]: redemptions", request.redemption_id) + if request.redemption_id not in list(data.keys()): + self._add_error(result, "[OCR]: Not found redemption_ID", request.redemption_id) + return + if request.pages != data[request.redemption_id]["pages"]: + self._add_error(result, "[SBT]: Mismatch files number in a request", request.redemption_id) + return + + file_paths_by_doc_type = self._try_find_doc_type(data[request.redemption_id]["image_paths"]) + if len(file_paths_by_doc_type["undefined"]) > 0: + self._add_warning(result, "[SBT]: Undefined doc type", request.redemption_id) + + if len(request.request_id.split(".")[0].split("_")) < 2: + return + images = SubscriptionRequestFile.objects.filter(request=request, file_category="Origin") + + for i, image in enumerate(images): + if image.doc_type not in ["imei", "invoice"]: + self._add_error(result, "[OCR]: Weird doc type", request.redemption_id) + continue + try: + if len(file_paths_by_doc_type[image.doc_type]) > 0: + local_file_path = file_paths_by_doc_type[image.doc_type].pop(0) + else: + local_file_path = file_paths_by_doc_type["undefined"].pop(0) + predir = "sbt_invoice" + s3_key = os.path.join(predir, request.request_id, image.file_name) + # s3_client.upload_file(local_file_path, s3_key) + result['total'] += 1 + self._add_info(result, "[OCR]: Success", request.redemption_id) + except IndexError as e: + self._add_error(result, "[OCR]: Mismatch doc type", request.redemption_id) + continue + except Exception as e: + self.stdout.write(self.style.ERROR(f"Request: {request.request_id} failed with {e}")) + print(traceback.format_exc()) + result['failed'] += 1 + self._add_info(result, "[OCR]: Failed", request.redemption_id) + continue + data.pop(request.redemption_id, None) + + def handle(self, *args, **options): + start = options['start'] + end = options['end'] + result = {'total':0, + 'failed':0} + # TODO: redemption ID is not null on filter + if start or end: + try: + start_date = timezone.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z') # We care only about day precision only + end_date = timezone.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z') + except Exception as e: + print(f"[INFO]: start: {start}") + print(f"[INFO]: end: {end}") + raise InvalidException(excArgs="Date format") + subcription_iter = SubscriptionRequest.objects.filter(created_at__range=(start_date, end_date), redemption_id__isnull=False) + else: + subcription_iter = SubscriptionRequest.objects.filter(redemption_id__isnull=False) + print(f"[INFO]: Preparing data for filling up...") + prepared_data = self._prepare_data(IMAGE_DIRS) + print(f"[INFO]: Prepared data, total: {len(list(prepared_data.keys()))}") + prepared_data_copy = copy.deepcopy(prepared_data) + s3_client = MinioS3Client( + # endpoint='http://107.120.133.27:9884', + access_key='AKIA3AFPFVWZHTZHB6FW', + secret_key='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', + bucket_name='ocr-sds' + ) + # file = open("modified.txt", "w") + for request in tqdm(subcription_iter.iterator()): + 
self.process_request(request, prepared_data_copy, result, s3_client) + # file.close() + self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully! total/failed: {}/{}'.format(result['total'], result['failed']))) + # print(f"[INFO]: result: {result}") + for err in result.get("Error", []): + print("[INFO]: Error: {}: {}".format(err, len(result["Error"][err]))) + result["Error"][err] = list(result["Error"][err]) + for info in result.get("Info", []): + print("[INFO]: Info: {}: {}".format(info, len(result["Info"][info]))) + result["Info"][info] = list(result["Info"][info]) + for warn in result.get("Warning", []): + print("[INFO]: Warning: {}: {}".format(warn, len(result["Warning"][warn]))) + result["Warning"][warn] = list(result["Warning"][warn]) + with open("result.json", "w") as outfile: + json.dump(result, outfile) \ No newline at end of file
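
Note on the averaging change (illustrative, not part of the patches): PATCH 1 and PATCH 3 drop the inline avg_acc/count_acc bookkeeping and feed a dedicated "average_accuracy_rate"["avg"] accumulator via .add(...) in update_total(), finalizing it by calling the instance when the report is serialized. The IterAvg class itself is not shown in these diffs; the sketch below is only an assumed minimal accumulator that is compatible with the calls visible here (.add(list), .count, .avg, and __call__ for finalization):

    class IterAvg:
        """Minimal incremental average over lists of scores (assumed shape)."""
        def __init__(self):
            self.avg = 0.0    # running mean of all values seen so far
            self.count = 0    # number of values accumulated

        def add(self, values):
            # Empty lists (missing keys) contribute nothing, matching update_total()
            for v in values:
                if v is None:
                    continue
                self.count += 1
                self.avg += (v - self.avg) / self.count

        def __call__(self):
            # Finalizing returns the plain number stored back into the report dict
            return self.avg

    # Hypothetical usage mirroring update_total() and the report finalization:
    acc = IterAvg()
    acc.add([1.0, 0.5])
    acc.add([0.75])
    print(acc.count, acc())   # 3 0.75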
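
PATCH 3 also adds settings.FIELDS_BY_SUB and starts passing request.subsidiary into calculate_subcription_file(), but the lookup that selects the field list per subsidiary is outside the visible hunks. A plausible helper, assuming the "default" entry is the fallback for subsidiaries without an override and that unknown doc types yield an empty list (fields_for is a hypothetical name):

    from django.conf import settings

    def fields_for(subsidiary, doc_type):
        # Fall back to the "default" mapping when the subsidiary has no override,
        # and to an empty list for unknown doc types (mirrors KEYS_BY_FILE_TYPE.get(..., [])).
        by_doc_type = settings.FIELDS_BY_SUB.get(subsidiary, settings.FIELDS_BY_SUB["default"])
        return by_doc_type.get(doc_type, [])

    # fields_for("SG", "invoice")    -> ["retailername", "purchase_date"]
    # fields_for("SEAO", "invoice")  -> ["retailername", "invoice_no", "purchase_date"]
    # fields_for("SG", "undefined")  -> []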