#!/usr/bin/env python3
import zipfile
import xml.etree.ElementTree as ET
import json
import re

# Default location of the JSON summary written by extract_excel_content().
_DEFAULT_OUTPUT_PATH = '/root/.openclaw/agents/101/workspace/excel_extracted.json'


def _local_name(tag):
    """Return an XML tag name with any ``{namespace}`` prefix removed."""
    return tag.rsplit('}', 1)[-1]


def _parse_shared_strings(xml_bytes):
    """Return one string per ``<si>`` entry of a shared-strings part.

    Cells reference shared strings by the position of their ``<si>`` item, so
    exactly one entry is produced per item: rich-text runs (``<r><t>``) are
    concatenated, empty items yield ``''``, and phonetic runs are ignored.
    """
    strings = []
    for item in ET.fromstring(xml_bytes):
        if _local_name(item.tag) != 'si':
            continue
        pieces = []
        for child in item:
            kind = _local_name(child.tag)
            if kind == 't':
                pieces.append(child.text or '')
            elif kind == 'r':
                pieces.extend(
                    run_text.text or ''
                    for run_text in child
                    if _local_name(run_text.tag) == 't'
                )
        strings.append(''.join(pieces))
    return strings


def _parse_cell_values(xml_bytes, strings):
    """Return display values for worksheet cells that carry a non-empty ``<v>``.

    Only cells marked ``t="s"`` are resolved through *strings*; every other
    cell keeps its raw ``<v>`` text, so plain numbers are not mistaken for
    shared-string indices.
    """
    values = []
    for row in ET.fromstring(xml_bytes).iter():
        if _local_name(row.tag) != 'row':
            continue
        for cell in row:
            if _local_name(cell.tag) != 'c':
                continue
            raw = next(
                (
                    child.text
                    for child in cell
                    if _local_name(child.tag) == 'v' and child.text
                ),
                None,
            )
            if raw is None:
                continue
            index_text = raw.strip()
            if cell.get('t') == 's' and index_text.isdigit():
                index = int(index_text)
                if index < len(strings):
                    values.append(strings[index])
                else:
                    # Keep a visible marker for a dangling shared-string index.
                    values.append(f"[索引 {raw}]")
            else:
                values.append(raw)
    return values


def extract_excel_content(file_path, output_path=_DEFAULT_OUTPUT_PATH):
    """Inspect an .xlsx file and save a JSON summary of its contents.

    The workbook is opened as a zip archive. The shared-strings part (if any)
    is parsed, the first matching sheet part (in archive order) is scanned for
    cell values, a report is printed to stdout, and a summary is written to
    *output_path* as UTF-8 JSON.

    Args:
        file_path: Path of the .xlsx file, as a string.
        output_path: Destination of the JSON summary. Defaults to the
            original fixed workspace location.

    Returns:
        A dict with the keys ``file_name``, ``shared_strings``,
        ``sheet_count`` and ``sheet_files``, or ``None`` if anything fails
        (the error is printed rather than raised).
    """
    print("🔍 开始分析Excel文件...")
    print(f"文件: {file_path}")

    try:
        # An .xlsx file is a zip archive of XML parts.
        # NOTE(review): xml.etree.ElementTree is not hardened against
        # maliciously constructed XML; confirm the input is trusted.
        with zipfile.ZipFile(file_path, 'r') as archive:
            members = archive.namelist()
            print(f"\n📁 文件包含 {len(members)} 个内部文件")

            # Locate the shared-strings part and every sheet-like XML part.
            shared_strings = None
            sheet_files = []
            for member in members:
                if 'sharedStrings.xml' in member:
                    shared_strings = member
                elif 'sheet' in member and member.endswith('.xml'):
                    sheet_files.append(member)

            print(f"共享字符串文件: {shared_strings}")
            print(f"工作表文件: {sheet_files}")

            # Parse from bytes so the XML declaration decides the encoding.
            strings = []
            if shared_strings:
                strings = _parse_shared_strings(archive.read(shared_strings))
                print(f"\n📝 找到 {len(strings)} 个文本字符串")
                if strings:
                    print("示例字符串:")
                    for number, text in enumerate(strings[:10], start=1):
                        print(f"  {number}. {text}")

            # Only the first sheet part found in the archive is analysed.
            if sheet_files:
                sheet_file = sheet_files[0]
                cells = _parse_cell_values(archive.read(sheet_file), strings)

                print(f"\n📊 工作表 '{sheet_file}' 分析:")
                print(f"单元格数量: {len(cells)}")

                if cells:
                    print("单元格值示例:")
                    for number, value in enumerate(cells[:20], start=1):
                        print(f"  {number}. {value}")

        extracted_data = {
            'file_name': file_path.split('/')[-1],
            'shared_strings': strings,
            'sheet_count': len(sheet_files),
            'sheet_files': sheet_files,
        }

        with open(output_path, 'w', encoding='utf-8') as out_file:
            json.dump(extracted_data, out_file, ensure_ascii=False, indent=2)

        print("\n✅ 内容提取完成！")
        print(f"数据已保存到: {output_path}")

        return extracted_data

    except Exception as e:
        # Best-effort tool: report the failure and signal it with None.
        print(f"❌ 处理文件时出错: {e}")
        return None

if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the built-in path;
    # with no argument the script analyses the same fixed file as before.
    default_path = "/root/.openclaw/media/inbound/ä¼_ä_å¾_ä_è_½å_é_ªè_å_æ_å_é---86c8a8c3-f1bc-47fc-88e5-fb1bd39dc7fc.xlsx"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    extract_excel_content(file_path)