import json


def batch_infer_check_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        total = 0
        custom_id_set = set()
        max = 50000
        for line in file:
            if line.strip() == "":
                continue
            try:
                line_dict = json.loads(line)
            except json.decoder.JSONDecodeError:
                raise Exception(f"输入文件错误，第{total + 1}行非json数据")
            if not line_dict.get("custom_id"):
                raise Exception(f"输入文件错误，第{total + 1}行custom_id不存在")
            if not isinstance(line_dict.get("custom_id"), str):
                raise Exception(f"输入文件错误, 第{total + 1}行custom_id不是string")
            if line_dict.get("custom_id") in custom_id_set:
                raise Exception(
                    f"输入文件错误，custom_id={line_dict.get('custom_id', '')}存在重复"
                )
            else:
                custom_id_set.add(line_dict.get("custom_id"))
            if not isinstance(line_dict.get("body", ""), dict):
                raise Exception(
                    f"输入文件错误，custom_id={line_dict.get('custom_id', '')}的body非json字符串"
                )
            if total > max:
                raise Exception(
                    f"输入文件错误：请求超过5万条"
                )
            total += 1
    return total


# 替换<YOUR_JSONL_FILE>为你的JSONL文件路径
file_path = "./demo-dev.jsonl"
total_lines = batch_infer_check_file(file_path)
print(f"文件中有效JSON数据的行数为: {total_lines}")
