diff --git a/composer.json b/composer.json index 5d43e7a..40f11dd 100644 --- a/composer.json +++ b/composer.json @@ -20,8 +20,7 @@ "php": ">=5.0" }, "require-dev": { - "phpunit/phpunit": "5.6.*", - "phpdocumentor/phpdocumentor": "2.9.*" + "phpunit/phpunit": "^9.6" }, "autoload": { "files": ["lib/Mixpanel.php"] diff --git a/lib/Base/MixpanelBase.php b/lib/Base/MixpanelBase.php index 2264706..659e6f1 100644 --- a/lib/Base/MixpanelBase.php +++ b/lib/Base/MixpanelBase.php @@ -22,7 +22,9 @@ class Base_MixpanelBase { "people_endpoint" => "/engage", // host relative endpoint for people updates "groups_endpoint" => "/groups", // host relative endpoint for groups updates "use_ssl" => true, // use ssl when available - "error_callback" => null // callback to use on consumption failures + "error_callback" => null, // callback to use on consumption failures + "bot_detection" => false, // enable AI bot classification + "bot_additional_patterns" => array() // additional bot patterns ); diff --git a/lib/BotClassifier/AiBotClassifier.php b/lib/BotClassifier/AiBotClassifier.php new file mode 100644 index 0000000..697c081 --- /dev/null +++ b/lib/BotClassifier/AiBotClassifier.php @@ -0,0 +1,62 @@ +_patterns = array_merge($additional_bots, BotClassifier_AiBotDatabase::getDatabase()); + } + + /** + * Classify a user-agent string against the AI bot database. + * @param string|null $user_agent + * @return array Classification result with '$is_ai_bot' (always present) and optional + * '$ai_bot_name', '$ai_bot_provider', '$ai_bot_category' + */ + public function classify($user_agent) { + if ($user_agent === null || $user_agent === "" || !is_string($user_agent)) { + return array('$is_ai_bot' => false); + } + foreach ($this->_patterns as $bot) { + $match = @preg_match($bot["pattern"], $user_agent); + if ($match === false) { + continue; // Invalid regex, skip this pattern + } + if ($match) { + return array( + '$is_ai_bot' => true, + '$ai_bot_name' => $bot["name"], + '$ai_bot_provider' => $bot["provider"], + '$ai_bot_category' => $bot["category"] + ); + } + } + return array('$is_ai_bot' => false); + } + + /** + * Create a classifier with optional additional bot patterns (checked before built-in). + * @param array $options Options array with optional 'additional_bots' key + * @return BotClassifier_AiBotClassifier + */ + public static function createClassifier($options = array()) { + $additional = isset($options["additional_bots"]) ? $options["additional_bots"] : array(); + return new BotClassifier_AiBotClassifier($additional); + } + + /** + * Return bot database for inspection (no regex patterns exposed). + * @return array + */ + public function getBotDatabase() { + return BotClassifier_AiBotDatabase::getDatabaseForInspection(); + } +} diff --git a/lib/BotClassifier/AiBotDatabase.php b/lib/BotClassifier/AiBotDatabase.php new file mode 100644 index 0000000..79819e5 --- /dev/null +++ b/lib/BotClassifier/AiBotDatabase.php @@ -0,0 +1,77 @@ + "/GPTBot\//i", "name" => "GPTBot", + "provider" => "OpenAI", "category" => "indexing", + "description" => "OpenAI web crawler for model training data"), + array("pattern" => "/ChatGPT-User\//i", "name" => "ChatGPT-User", + "provider" => "OpenAI", "category" => "retrieval", + "description" => "ChatGPT real-time retrieval for user queries (RAG)"), + array("pattern" => "/OAI-SearchBot\//i", "name" => "OAI-SearchBot", + "provider" => "OpenAI", "category" => "indexing", + "description" => "OpenAI search indexing crawler"), + // === Anthropic === + array("pattern" => "/ClaudeBot\//i", "name" => "ClaudeBot", + "provider" => "Anthropic", "category" => "indexing", + "description" => "Anthropic web crawler for model training"), + array("pattern" => "/Claude-User\//i", "name" => "Claude-User", + "provider" => "Anthropic", "category" => "retrieval", + "description" => "Claude real-time retrieval for user queries"), + // === Google === + array("pattern" => "/Google-Extended\//i", "name" => "Google-Extended", + "provider" => "Google", "category" => "indexing", + "description" => "Google AI training data crawler (separate from Googlebot)"), + // === Perplexity === + array("pattern" => "/PerplexityBot\//i", "name" => "PerplexityBot", + "provider" => "Perplexity", "category" => "retrieval", + "description" => "Perplexity AI search crawler"), + // === ByteDance === + array("pattern" => "/Bytespider\//i", "name" => "Bytespider", + "provider" => "ByteDance", "category" => "indexing", + "description" => "ByteDance/TikTok AI crawler"), + // === Common Crawl === + array("pattern" => "/CCBot\//i", "name" => "CCBot", + "provider" => "Common Crawl", "category" => "indexing", + "description" => "Common Crawl bot (data used by many AI models)"), + // === Apple === + array("pattern" => "/Applebot-Extended\//i", "name" => "Applebot-Extended", + "provider" => "Apple", "category" => "indexing", + "description" => "Apple AI/Siri training data crawler"), + // === Meta === + array("pattern" => "/Meta-ExternalAgent\//i", "name" => "Meta-ExternalAgent", + "provider" => "Meta", "category" => "indexing", + "description" => "Meta/Facebook AI training data crawler"), + // === Cohere === + array("pattern" => "/cohere-ai\//i", "name" => "cohere-ai", + "provider" => "Cohere", "category" => "indexing", + "description" => "Cohere AI training data crawler"), + ); + + /** @return array */ + public static function getDatabase() { + return self::$_database; + } + + /** + * Returns database entries without regex patterns (safe for inspection). + * @return array + */ + public static function getDatabaseForInspection() { + $result = array(); + foreach (self::$_database as $entry) { + $result[] = array( + "name" => $entry["name"], + "provider" => $entry["provider"], + "category" => $entry["category"], + "description" => $entry["description"] + ); + } + return $result; + } +} diff --git a/lib/ConsumerStrategies/BotClassifyingConsumer.php b/lib/ConsumerStrategies/BotClassifyingConsumer.php new file mode 100644 index 0000000..3ee5ac3 --- /dev/null +++ b/lib/ConsumerStrategies/BotClassifyingConsumer.php @@ -0,0 +1,62 @@ + "ConsumerStrategies_FileConsumer", + "curl" => "ConsumerStrategies_CurlConsumer", + "socket" => "ConsumerStrategies_SocketConsumer" + ); + + function __construct($options = array()) { + parent::__construct($options); + $inner_key = isset($options["bot_classifying_inner_consumer"]) + ? $options["bot_classifying_inner_consumer"] : "curl"; + // NOTE: Do NOT merge $options["consumers"] into $_consumers here. + // The "consumers" key in $options is also consumed by Producers_MixpanelBaseProducer, + // and merging it would include "bot_classifying" => self, risking self-instantiation. + // Only the three hardcoded consumer types (file, curl, socket) are valid inner consumers. + $InnerClass = $this->_consumers[$inner_key]; + $this->_innerConsumer = new $InnerClass($options); + $additional_bots = isset($options["bot_additional_patterns"]) + ? $options["bot_additional_patterns"] : array(); + $this->_classifier = new BotClassifier_AiBotClassifier($additional_bots); + if (isset($options["bot_user_agent_property"])) { + $this->_userAgentProperty = $options["bot_user_agent_property"]; + } + } + + /** + * Classify bot user-agents in each message and forward to inner consumer. + * @param array $batch + * @return boolean + */ + public function persist($batch) { + foreach ($batch as &$message) { + if (isset($message["properties"]) && isset($message["properties"][$this->_userAgentProperty])) { + $classification = $this->_classifier->classify($message["properties"][$this->_userAgentProperty]); + $message["properties"] = array_merge($message["properties"], $classification); + } + } + unset($message); + return $this->_innerConsumer->persist($batch); + } + + /** @return int */ + public function getNumThreads() { + return $this->_innerConsumer->getNumThreads(); + } +} diff --git a/lib/Mixpanel.php b/lib/Mixpanel.php index 632bbd7..dda32b9 100644 --- a/lib/Mixpanel.php +++ b/lib/Mixpanel.php @@ -4,6 +4,8 @@ require_once(dirname(__FILE__) . "/Producers/MixpanelPeople.php"); require_once(dirname(__FILE__) . "/Producers/MixpanelEvents.php"); require_once(dirname(__FILE__) . "/Producers/MixpanelGroups.php"); +require_once(dirname(__FILE__) . "/BotClassifier/AiBotClassifier.php"); +require_once(dirname(__FILE__) . "/ConsumerStrategies/BotClassifyingConsumer.php"); /** * This is the main class for the Mixpanel PHP Library which provides all of the methods you need to track events, @@ -123,6 +125,9 @@ class Mixpanel extends Base_MixpanelBase { */ private $_events; + /** @var BotClassifier_AiBotClassifier|null */ + private $_botClassifier = null; + /** * An instance of the MixpanelGroups class (used to create/update group profiles) * @var Producers_MixpanelPeople @@ -148,6 +153,12 @@ public function __construct($token, $options = array()) { $this->people = new Producers_MixpanelPeople($token, $options); $this->_events = new Producers_MixpanelEvents($token, $options); $this->group = new Producers_MixpanelGroups($token, $options); + // Initialize bot classifier if bot_detection is enabled + if (isset($this->_options["bot_detection"]) && $this->_options["bot_detection"]) { + $additional_bots = isset($this->_options["bot_additional_patterns"]) + ? $this->_options["bot_additional_patterns"] : array(); + $this->_botClassifier = new BotClassifier_AiBotClassifier($additional_bots); + } } @@ -200,6 +211,15 @@ public function reset() { } + /** + * Get the events queue (delegates to the events producer). + * @return array + */ + public function getQueue() { + return $this->_events->getQueue(); + } + + /** * Identify the user you want to associate to tracked events. The $anon_id must be UUID v4 format and not already merged to an $identified_id. * All identify calls with a new and valid $anon_id will trigger a track $identify event, and merge to the $identified_id. @@ -216,9 +236,28 @@ public function identify($user_id, $anon_id = null) { * @param array $properties */ public function track($event, $properties = array()) { + if ($this->_botClassifier !== null + && isset($properties['$user_agent']) + && !$this->_isUsingBotClassifyingConsumer()) { + $classification = $this->_botClassifier->classify($properties['$user_agent']); + $properties = array_merge($properties, $classification); + } $this->_events->track($event, $properties); } + /** + * Check if the configured consumer is BotClassifyingConsumer to avoid double-classification. + * @return bool + */ + private function _isUsingBotClassifyingConsumer() { + if (!isset($this->_options['consumers']) || !is_array($this->_options['consumers'])) { + return false; + } + $consumerKey = $this->_options['consumer']; + return isset($this->_options['consumers'][$consumerKey]) + && $this->_options['consumers'][$consumerKey] === 'ConsumerStrategies_BotClassifyingConsumer'; + } + /** * Register a property to be sent with every event. diff --git a/phpunit.xml.dist b/phpunit.xml.dist index 81c67c7..a4d0163 100644 --- a/phpunit.xml.dist +++ b/phpunit.xml.dist @@ -1,29 +1,16 @@ - + verbose="false" + bootstrap="vendor/autoload.php"> + ./test/ - - - - examples - vendor - test - - + diff --git a/test/Base/MixpanelBaseProducerTest.php b/test/Base/MixpanelBaseProducerTest.php index f7beca6..c791a33 100644 --- a/test/Base/MixpanelBaseProducerTest.php +++ b/test/Base/MixpanelBaseProducerTest.php @@ -1,19 +1,21 @@ _file = dirname(__FILE__)."/output-".time().".txt"; $this->_instance = new _Producers_MixpanelBaseProducer("token", array("consumer" => "file", "debug" => true, "file" => $this->_file)); } - protected function tearDown() { + protected function tearDown(): void { parent::tearDown(); $this->_instance->reset(); $this->_instance = null; diff --git a/test/BotClassifier/AiBotClassifierTest.php b/test/BotClassifier/AiBotClassifierTest.php new file mode 100644 index 0000000..33b8ce6 --- /dev/null +++ b/test/BotClassifier/AiBotClassifierTest.php @@ -0,0 +1,284 @@ +_classifier = new BotClassifier_AiBotClassifier(); + } + + protected function tearDown(): void { + parent::tearDown(); + $this->_classifier = null; + } + + // === POSITIVE MATCHES — OpenAI === + + public function testClassifiesGPTBot() { + $result = $this->_classifier->classify( + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)" + ); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("GPTBot", $result['$ai_bot_name']); + $this->assertEquals("OpenAI", $result['$ai_bot_provider']); + $this->assertEquals("indexing", $result['$ai_bot_category']); + } + + public function testClassifiesChatGPTUser() { + $result = $this->_classifier->classify( + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ChatGPT-User/1.0; +https://openai.com/bot)" + ); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("ChatGPT-User", $result['$ai_bot_name']); + $this->assertEquals("OpenAI", $result['$ai_bot_provider']); + $this->assertEquals("retrieval", $result['$ai_bot_category']); + } + + public function testClassifiesOAISearchBot() { + $result = $this->_classifier->classify( + "Mozilla/5.0 (compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot)" + ); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("OAI-SearchBot", $result['$ai_bot_name']); + $this->assertEquals("OpenAI", $result['$ai_bot_provider']); + $this->assertEquals("indexing", $result['$ai_bot_category']); + } + + // === POSITIVE MATCHES — Anthropic === + + public function testClassifiesClaudeBot() { + $result = $this->_classifier->classify( + "Mozilla/5.0 (compatible; ClaudeBot/1.0; +claudebot@anthropic.com)" + ); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("ClaudeBot", $result['$ai_bot_name']); + $this->assertEquals("Anthropic", $result['$ai_bot_provider']); + $this->assertEquals("indexing", $result['$ai_bot_category']); + } + + public function testClassifiesClaudeUser() { + $result = $this->_classifier->classify("Mozilla/5.0 (compatible; Claude-User/1.0)"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("Claude-User", $result['$ai_bot_name']); + $this->assertEquals("Anthropic", $result['$ai_bot_provider']); + $this->assertEquals("retrieval", $result['$ai_bot_category']); + } + + // === POSITIVE MATCHES — Google, Perplexity, ByteDance, Common Crawl, Apple, Meta, Cohere === + + public function testClassifiesGoogleExtended() { + $result = $this->_classifier->classify("Mozilla/5.0 (compatible; Google-Extended/1.0)"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("Google-Extended", $result['$ai_bot_name']); + $this->assertEquals("Google", $result['$ai_bot_provider']); + $this->assertEquals("indexing", $result['$ai_bot_category']); + } + + public function testClassifiesPerplexityBot() { + $result = $this->_classifier->classify("Mozilla/5.0 (compatible; PerplexityBot/1.0)"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("PerplexityBot", $result['$ai_bot_name']); + $this->assertEquals("Perplexity", $result['$ai_bot_provider']); + $this->assertEquals("retrieval", $result['$ai_bot_category']); + } + + public function testClassifiesBytespider() { + $result = $this->_classifier->classify("Mozilla/5.0 (compatible; Bytespider/1.0)"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("Bytespider", $result['$ai_bot_name']); + $this->assertEquals("ByteDance", $result['$ai_bot_provider']); + $this->assertEquals("indexing", $result['$ai_bot_category']); + } + + public function testClassifiesCCBot() { + $result = $this->_classifier->classify("CCBot/2.0 (https://commoncrawl.org/faq/)"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("CCBot", $result['$ai_bot_name']); + $this->assertEquals("Common Crawl", $result['$ai_bot_provider']); + $this->assertEquals("indexing", $result['$ai_bot_category']); + } + + public function testClassifiesApplebotExtended() { + $result = $this->_classifier->classify( + "Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Applebot-Extended/0.1" + ); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("Applebot-Extended", $result['$ai_bot_name']); + $this->assertEquals("Apple", $result['$ai_bot_provider']); + $this->assertEquals("indexing", $result['$ai_bot_category']); + } + + public function testClassifiesMetaExternalAgent() { + $result = $this->_classifier->classify("Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("Meta-ExternalAgent", $result['$ai_bot_name']); + $this->assertEquals("Meta", $result['$ai_bot_provider']); + $this->assertEquals("indexing", $result['$ai_bot_category']); + } + + public function testClassifiesCohereAi() { + $result = $this->_classifier->classify("Mozilla/5.0 (compatible; cohere-ai/1.0)"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("cohere-ai", $result['$ai_bot_name']); + $this->assertEquals("Cohere", $result['$ai_bot_provider']); + $this->assertEquals("indexing", $result['$ai_bot_category']); + } + + // === NEGATIVE CASES === + + public function testNotAiBotChrome() { + $result = $this->_classifier->classify( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + ); + $this->assertFalse($result['$is_ai_bot']); + $this->assertArrayNotHasKey('$ai_bot_name', $result); + } + + public function testNotAiBotGooglebot() { + $result = $this->_classifier->classify( + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" + ); + $this->assertFalse($result['$is_ai_bot']); + } + + public function testNotAiBotBingbot() { + $result = $this->_classifier->classify( + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" + ); + $this->assertFalse($result['$is_ai_bot']); + } + + public function testNotAiBotCurl() { + $result = $this->_classifier->classify("curl/7.64.1"); + $this->assertFalse($result['$is_ai_bot']); + } + + public function testEmptyUserAgent() { + $result = $this->_classifier->classify(""); + $this->assertFalse($result['$is_ai_bot']); + } + + public function testNullUserAgent() { + $result = $this->_classifier->classify(null); + $this->assertFalse($result['$is_ai_bot']); + } + + // === CASE SENSITIVITY === + + public function testCaseInsensitiveMatching() { + $result = $this->_classifier->classify("mozilla/5.0 (compatible; gptbot/1.2)"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("GPTBot", $result['$ai_bot_name']); + } + + public function testCaseInsensitiveClaudeBot() { + $result = $this->_classifier->classify("claudebot/1.0"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("ClaudeBot", $result['$ai_bot_name']); + } + + // === RETURN SHAPE VALIDATION === + + public function testMatchReturnsAllExpectedKeys() { + $result = $this->_classifier->classify("GPTBot/1.2"); + $this->assertArrayHasKey('$is_ai_bot', $result); + $this->assertArrayHasKey('$ai_bot_name', $result); + $this->assertArrayHasKey('$ai_bot_provider', $result); + $this->assertArrayHasKey('$ai_bot_category', $result); + $this->assertTrue(is_string($result['$ai_bot_name'])); + $this->assertTrue(is_string($result['$ai_bot_provider'])); + $this->assertTrue( + in_array($result['$ai_bot_category'], array("indexing", "retrieval", "agent")) + ); + } + + public function testNonMatchReturnsOnlyIsAiBot() { + $result = $this->_classifier->classify("Mozilla/5.0 Chrome/120"); + $this->assertEquals(array('$is_ai_bot'), array_keys($result)); + $this->assertFalse($result['$is_ai_bot']); + } + + // === CUSTOM BOT REGISTRATION === + + public function testCustomBotPatternIsRecognized() { + $classifier = BotClassifier_AiBotClassifier::createClassifier(array( + "additional_bots" => array( + array( + "pattern" => "/MyCustomBot\//i", + "name" => "MyCustomBot", + "provider" => "CustomCorp", + "category" => "indexing" + ) + ) + )); + $result = $classifier->classify("Mozilla/5.0 (compatible; MyCustomBot/1.0)"); + $this->assertTrue($result['$is_ai_bot']); + $this->assertEquals("MyCustomBot", $result['$ai_bot_name']); + } + + public function testCustomBotTakesPriorityOverBuiltIn() { + $classifier = BotClassifier_AiBotClassifier::createClassifier(array( + "additional_bots" => array( + array( + "pattern" => "/GPTBot\//i", + "name" => "GPTBot-Custom", + "provider" => "CustomProvider", + "category" => "retrieval" + ) + ) + )); + $result = $classifier->classify("GPTBot/1.2"); + $this->assertEquals("GPTBot-Custom", $result['$ai_bot_name']); + $this->assertEquals("CustomProvider", $result['$ai_bot_provider']); + } + + // === BOT DATABASE INSPECTION === + + public function testGetBotDatabaseReturnsArray() { + $db = $this->_classifier->getBotDatabase(); + $this->assertTrue(is_array($db)); + $this->assertGreaterThan(0, count($db)); + } + + public function testGetBotDatabaseEntriesHaveRequiredFields() { + $db = $this->_classifier->getBotDatabase(); + foreach ($db as $entry) { + $this->assertArrayHasKey("name", $entry); + $this->assertArrayHasKey("provider", $entry); + $this->assertArrayHasKey("category", $entry); + $this->assertTrue( + in_array($entry["category"], array("indexing", "retrieval", "agent")) + ); + } + } + + public function testGetBotDatabaseHasAtLeast12Entries() { + $db = $this->_classifier->getBotDatabase(); + $this->assertGreaterThanOrEqual(12, count($db)); + } + + // === PATTERN VERIFICATION (via getDatabase() directly) === + + public function testRawDatabaseEntriesHavePatternKey() { + $db = BotClassifier_AiBotDatabase::getDatabase(); + $this->assertGreaterThan(0, count($db)); + foreach ($db as $index => $entry) { + $this->assertArrayHasKey("pattern", $entry, + "Bot database entry at index $index is missing 'pattern' key" + ); + $this->assertTrue(is_string($entry["pattern"]), + "Bot database entry at index $index 'pattern' must be a string" + ); + // Verify the pattern is a valid regex + $this->assertNotFalse(@preg_match($entry["pattern"], ""), + "Bot database entry at index $index has invalid regex pattern: " . $entry["pattern"] + ); + } + } +} diff --git a/test/BotClassifier/BotClassifyingIntegrationTest.php b/test/BotClassifier/BotClassifyingIntegrationTest.php new file mode 100644 index 0000000..ac94cd0 --- /dev/null +++ b/test/BotClassifier/BotClassifyingIntegrationTest.php @@ -0,0 +1,179 @@ +_instance = new Mixpanel("test-token", array( + "bot_detection" => true + )); + } + + protected function tearDown(): void { + parent::tearDown(); + $this->_instance->reset(); + $this->_instance = null; + } + + // === CORE CLASSIFICATION VIA track() === + + public function testEnrichesTrackCallsWhenUserAgentPresent() { + $this->_instance->track("page_view", array( + '$user_agent' => 'Mozilla/5.0 (compatible; GPTBot/1.2; +https://openai.com/gptbot)', + 'distinct_id' => 'user123' + )); + $queue = $this->_instance->getQueue(); + $this->assertEquals(1, count($queue)); + $this->assertEquals("page_view", $queue[0]['event']); + $props = $queue[0]['properties']; + $this->assertTrue($props['$is_ai_bot']); + $this->assertEquals("GPTBot", $props['$ai_bot_name']); + $this->assertEquals("OpenAI", $props['$ai_bot_provider']); + $this->assertEquals("indexing", $props['$ai_bot_category']); + } + + public function testNoClassificationWhenUserAgentAbsent() { + $this->_instance->track("page_view", array( + 'distinct_id' => 'user123', 'page' => '/home' + )); + $queue = $this->_instance->getQueue(); + $props = $queue[0]['properties']; + $this->assertArrayNotHasKey('$is_ai_bot', $props); + $this->assertArrayNotHasKey('$ai_bot_name', $props); + } + + public function testIsAiBotFalseWhenUserAgentIsNotBot() { + $this->_instance->track("page_view", array( + '$user_agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0', + 'distinct_id' => 'user123' + )); + $queue = $this->_instance->getQueue(); + $props = $queue[0]['properties']; + $this->assertFalse($props['$is_ai_bot']); + $this->assertArrayNotHasKey('$ai_bot_name', $props); + } + + // === PROPERTY PRESERVATION === + + public function testPreservesUserProperties() { + $this->_instance->track("page_view", array( + '$user_agent' => 'GPTBot/1.2', + 'page_url' => '/products', 'custom_prop' => 'value', 'distinct_id' => 'user123' + )); + $props = $this->_instance->getQueue()[0]['properties']; + $this->assertEquals("/products", $props['page_url']); + $this->assertEquals("value", $props['custom_prop']); + $this->assertEquals("user123", $props['distinct_id']); + $this->assertTrue($props['$is_ai_bot']); + } + + public function testSuperPropertiesMergeCorrectly() { + $this->_instance->register("platform", "web"); + $this->_instance->register("app_version", "2.0"); + $this->_instance->track("page_view", array( + '$user_agent' => 'GPTBot/1.2', 'distinct_id' => 'user123' + )); + $props = $this->_instance->getQueue()[0]['properties']; + $this->assertEquals("web", $props['platform']); + $this->assertEquals("2.0", $props['app_version']); + $this->assertTrue($props['$is_ai_bot']); + } + + public function testSdkDefaultPropertiesStillAdded() { + $this->_instance->track("page_view", array('$user_agent' => 'GPTBot/1.2')); + $props = $this->_instance->getQueue()[0]['properties']; + $this->assertEquals("test-token", $props['token']); + $this->assertArrayHasKey("time", $props); + $this->assertEquals("php", $props['mp_lib']); + } + + // === BOT DETECTION DISABLED === + + public function testNoClassificationWhenBotDetectionDisabled() { + $mp = new Mixpanel("test-token", array("bot_detection" => false)); + $mp->track("page_view", array( + '$user_agent' => 'GPTBot/1.2', 'distinct_id' => 'user123' + )); + $props = $mp->getQueue()[0]['properties']; + $this->assertArrayNotHasKey('$is_ai_bot', $props); + $mp->reset(); + } + + public function testNoClassificationWhenBotDetectionNotSet() { + $mp = new Mixpanel("test-token"); + $mp->track("page_view", array( + '$user_agent' => 'GPTBot/1.2', 'distinct_id' => 'user123' + )); + $props = $mp->getQueue()[0]['properties']; + $this->assertArrayNotHasKey('$is_ai_bot', $props); + $mp->reset(); + } + + // === MULTIPLE BOT TYPES === + + public function testMultipleBotTypesClassifiedCorrectly() { + $bots = array( + array('GPTBot/1.2', 'GPTBot', 'OpenAI'), + array('ClaudeBot/1.0', 'ClaudeBot', 'Anthropic'), + array('PerplexityBot/1.0', 'PerplexityBot', 'Perplexity'), + array('CCBot/2.0', 'CCBot', 'Common Crawl'), + ); + foreach ($bots as $bot) { + $this->_instance->reset(); + $this->_instance->track("page_view", array('$user_agent' => $bot[0])); + $props = $this->_instance->getQueue()[0]['properties']; + $this->assertTrue($props['$is_ai_bot'], "Failed for " . $bot[0]); + $this->assertEquals($bot[1], $props['$ai_bot_name'], "Wrong name for " . $bot[0]); + $this->assertEquals($bot[2], $props['$ai_bot_provider'], "Wrong provider for " . $bot[0]); + } + } + + // === CONSUMER WRAPPER APPROACH === + + public function testConsumerWrapperClassifiesEvents() { + $file = dirname(__FILE__) . "/consumer-test-" . time() . ".txt"; + $mp = new Mixpanel("test-token", array( + "consumers" => array("bot_classifying" => "ConsumerStrategies_BotClassifyingConsumer"), + "consumer" => "bot_classifying", + "bot_classifying_inner_consumer" => "file", + "file" => $file + )); + $mp->track("page_view", array( + '$user_agent' => 'GPTBot/1.2', 'distinct_id' => 'user123' + )); + $queue = $mp->getQueue(); + $this->assertEquals(1, count($queue)); + $this->assertEquals("page_view", $queue[0]['event']); + // Classification happens in the consumer during flush, not at track() time + $mp->flush(); + $this->assertFileExists($file); + $contents = file_get_contents($file); + $this->assertStringContainsString('$is_ai_bot', $contents); + $this->assertStringContainsString('$ai_bot_name', $contents); + $this->assertStringContainsString('$ai_bot_provider', $contents); + $this->assertStringContainsString('$ai_bot_category', $contents); + $mp->reset(); + if (file_exists($file)) { unlink($file); } + } + + // === EVENT NAME + IDENTIFY === + + public function testEventNamePreserved() { + $this->_instance->track("custom_event_name", array('$user_agent' => 'GPTBot/1.2')); + $this->assertEquals("custom_event_name", $this->_instance->getQueue()[0]['event']); + } + + public function testIdentifyStillWorksWithBotDetection() { + $this->_instance->identify("user123"); + $this->_instance->track("page_view", array('$user_agent' => 'GPTBot/1.2')); + $props = $this->_instance->getQueue()[0]['properties']; + $this->assertEquals("user123", $props['distinct_id']); + $this->assertTrue($props['$is_ai_bot']); + } +} diff --git a/test/ConsumerStrategies/AbstractConsumerTest.php b/test/ConsumerStrategies/AbstractConsumerTest.php index 5605cbf..5993c6d 100644 --- a/test/ConsumerStrategies/AbstractConsumerTest.php +++ b/test/ConsumerStrategies/AbstractConsumerTest.php @@ -1,19 +1,21 @@ _instance = new AbstractConsumer(); } - protected function tearDown() + protected function tearDown(): void { parent::tearDown(); $this->_instance = null; diff --git a/test/ConsumerStrategies/CurlConsumerTest.php b/test/ConsumerStrategies/CurlConsumerTest.php index aa2b886..d65538a 100644 --- a/test/ConsumerStrategies/CurlConsumerTest.php +++ b/test/ConsumerStrategies/CurlConsumerTest.php @@ -1,6 +1,8 @@ _file = dirname(__FILE__)."/output-".time().".txt"; $this->_instance = new ConsumerStrategies_FileConsumer(array("file" => $this->_file)); } - protected function tearDown() + protected function tearDown(): void { parent::tearDown(); $this->_instance = null; diff --git a/test/ConsumerStrategies/SocketConsumerTest.php b/test/ConsumerStrategies/SocketConsumerTest.php index 2115b8d..c0f24a6 100644 --- a/test/ConsumerStrategies/SocketConsumerTest.php +++ b/test/ConsumerStrategies/SocketConsumerTest.php @@ -1,13 +1,15 @@ _instance = new ConsumerStrategies_SocketConsumer(array( @@ -18,7 +20,7 @@ protected function setUp() )); } - protected function tearDown() + protected function tearDown(): void { parent::tearDown(); $this->_instance = null; diff --git a/test/MixpanelTest.php b/test/MixpanelTest.php index 9380612..45b8178 100644 --- a/test/MixpanelTest.php +++ b/test/MixpanelTest.php @@ -1,18 +1,20 @@ _instance = Mixpanel::getInstance("token"); } - protected function tearDown() { + protected function tearDown(): void { parent::tearDown(); $this->_instance->reset(); $this->_instance = null; diff --git a/test/Producers/MixpanelEventsProducerTest.php b/test/Producers/MixpanelEventsProducerTest.php index b38172b..b866b5f 100644 --- a/test/Producers/MixpanelEventsProducerTest.php +++ b/test/Producers/MixpanelEventsProducerTest.php @@ -1,19 +1,21 @@ _instance = new Producers_MixpanelEvents("token"); } - protected function tearDown() + protected function tearDown(): void { parent::tearDown(); $this->_instance->reset(); diff --git a/test/Producers/MixpanelGroupsProducerTest.php b/test/Producers/MixpanelGroupsProducerTest.php index 8c735ef..630c454 100644 --- a/test/Producers/MixpanelGroupsProducerTest.php +++ b/test/Producers/MixpanelGroupsProducerTest.php @@ -1,19 +1,21 @@ _instance = new Producers_MixpanelGroups("token"); } - protected function tearDown() + protected function tearDown(): void { parent::tearDown(); $this->_instance->reset(); diff --git a/test/Producers/MixpanelPeopleProducerTest.php b/test/Producers/MixpanelPeopleProducerTest.php index e748e26..77e2453 100644 --- a/test/Producers/MixpanelPeopleProducerTest.php +++ b/test/Producers/MixpanelPeopleProducerTest.php @@ -1,19 +1,21 @@ _instance = new Producers_MixpanelPeople("token"); } - protected function tearDown() + protected function tearDown(): void { parent::tearDown(); $this->_instance->reset();